diff --git a/.gitattributes b/.gitattributes index c7d9f3332a950355d5a77d85000f05e6f45435ea..a460cd62cd93a1387e4ee2c75b1c921f90c9df7f 100644 --- a/.gitattributes +++ b/.gitattributes @@ -32,3 +32,25 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text *.zip filter=lfs diff=lfs merge=lfs -text *.zst filter=lfs diff=lfs merge=lfs -text *tfevents* filter=lfs diff=lfs merge=lfs -text +fengshen/examples/finetune_taiyi_stable_diffusion/demo_dataset/part_0/00000003.jpg filter=lfs diff=lfs merge=lfs -text +fengshen/examples/stable_diffusion_chinese_EN/result_examples/cat_eating_guoqiao_noodle.png filter=lfs diff=lfs merge=lfs -text +fengshen/examples/stable_diffusion_chinese_EN/result_examples/huskiy_wearing_space_suit.png filter=lfs diff=lfs merge=lfs -text +fengshen/examples/stable_diffusion_chinese_EN/result_examples/xiaoqiao_oil_painting.png filter=lfs diff=lfs merge=lfs -text +fengshen/examples/stable_diffusion_chinese_EN/result_examples/xiaoqiao_vangogh.png filter=lfs diff=lfs merge=lfs -text +fengshen/examples/stable_diffusion_chinese/img/日出,海面上英文逗号.png filter=lfs diff=lfs merge=lfs -text +fengshen/examples/stable_diffusion_chinese/img/日出,海面上英文逗号4k壁纸.png filter=lfs diff=lfs merge=lfs -text +fengshen/examples/stable_diffusion_chinese/img/日出,海面上英文逗号4k壁纸384.png filter=lfs diff=lfs merge=lfs -text +fengshen/examples/stable_diffusion_chinese/img/日出,海面上英文逗号4k壁纸复杂.png filter=lfs diff=lfs merge=lfs -text +fengshen/examples/stable_diffusion_chinese/img/日出,海面上英文逗号4k壁纸高清.png filter=lfs diff=lfs merge=lfs -text +fengshen/examples/stable_diffusion_chinese/img/日出,海面上英文逗号4k壁纸精细.png filter=lfs diff=lfs merge=lfs -text +fengshen/examples/stable_diffusion_chinese/img/日出,海面上英文逗号插画.png filter=lfs diff=lfs merge=lfs -text +fengshen/examples/stable_diffusion_chinese/img/日出,海面上英文逗号水彩.png filter=lfs diff=lfs merge=lfs -text +fengshen/examples/stable_diffusion_chinese/img/日出,海面上英文逗号素描.png filter=lfs diff=lfs merge=lfs -text +fengshen/examples/stable_diffusion_chinese/img/日出,海面上英文逗号油画.png filter=lfs diff=lfs merge=lfs -text +fengshen/examples/stable_diffusion_chinese/img/日出,海面上中文逗号.png filter=lfs diff=lfs merge=lfs -text +fengshen/examples/stable_diffusion_chinese/img/日出,海面上中文感叹号.png filter=lfs diff=lfs merge=lfs -text +fengshen/examples/stable_diffusion_chinese/img/日出,海面上中文句号.png filter=lfs diff=lfs merge=lfs -text +fengshen/examples/stable_diffusion_chinese/img/日出,海面上nega广告.png filter=lfs diff=lfs merge=lfs -text +fengshen/examples/stable_diffusion_chinese/img/日出,海面上nega广告符号.png filter=lfs diff=lfs merge=lfs -text +fengshen/examples/stable_diffusion_chinese/img/日出,海面上nega广告符号词汇.png filter=lfs diff=lfs merge=lfs -text +fengshen/examples/stable_diffusion_dreambooth/duck_result.png filter=lfs diff=lfs merge=lfs -text diff --git a/fengshen/API/main.py b/fengshen/API/main.py new file mode 100644 index 0000000000000000000000000000000000000000..bd4ac4c4cdad24a090de80ad2a5967d73d5f6099 --- /dev/null +++ b/fengshen/API/main.py @@ -0,0 +1,76 @@ +import uvicorn +import click +import argparse +import json +from importlib import import_module +from fastapi import FastAPI, WebSocket +from starlette.middleware.cors import CORSMiddleware +from utils import user_config, api_logger, setup_logger, RequestDataStructure + +# 命令行启动时只输入一个参数,即配置文件的名字,eg: text_classification.json +# 其余所有配置在该配置文件中设定,不在命令行中指定 +total_parser = argparse.ArgumentParser("API") +total_parser.add_argument("config_path", type=str) +args = total_parser.parse_args() + +# set up user config +user_config.setup_config(args) + +# set up logger 
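+# (setup_logger, defined in utils.py, attaches both a StreamHandler and a FileHandler,
+# so log records go to the console and to <SERVER_NAME>.log, or to log_file_path when it is set)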
+setup_logger(api_logger, user_config) + +# load pipeline +pipeline_class = getattr(import_module('fengshen.pipelines.' + user_config.pipeline_type), 'Pipeline') +model_settings = user_config.model_settings +model_args = argparse.Namespace(**model_settings) +pipeline = pipeline_class( + args = model_args, + model = user_config.model_name + ) + + +# initialize app +app = FastAPI( + title = user_config.PROJECT_NAME, + openapi_url = f"{user_config.API_PREFIX_STR}/openapi.json" + ) + + +# api +# TODO +# 需要针对不同请求方法做不同判断,目前仅跑通了较通用的POST方法 +# POST方法可以完成大多数 输入文本-返回结果 的请求任务 +if(user_config.API_method == "POST"): + @app.post(user_config.API_path, tags = user_config.API_tags) + async def fengshen_post(data:RequestDataStructure): + # logging + api_logger.info(data.input_text) + + input_text = data.input_text + + result = pipeline(input_text) + + return result +else: + print("only support POST method") + + + +# Set all CORS enabled origins +if user_config.BACKEND_CORS_ORIGINS: + app.add_middleware( + CORSMiddleware, + allow_origins = [str(origin) for origin in user_config.BACKEND_CORS_ORIGINS], + allow_credentials = user_config.allow_credentials, + allow_methods = user_config.allow_methods, + allow_headers = user_config.allow_headers, + ) + + +if __name__ == '__main__': + + # 启动后可在浏览器打开 host:port/docs 查看接口的具体信息,并可进行简单测试 + # eg: 127.0.0.1:8990/docs + uvicorn.run(app, host = user_config.SERVER_HOST, port = user_config.SERVER_PORT) + + diff --git a/fengshen/API/text_classification.json b/fengshen/API/text_classification.json new file mode 100644 index 0000000000000000000000000000000000000000..f510becb1306bd66faf2573dbd09044867c08efe --- /dev/null +++ b/fengshen/API/text_classification.json @@ -0,0 +1,46 @@ +{ + "SERVER": { + "SERVER_HOST": "127.0.0.1", + "SERVER_PORT": 8990, + "SERVER_NAME": "fengshen_demo", + "PROJECT_NAME": "fengshen_demo", + "API_PREFIX_STR": "/api", + + "API_method" : "POST", + "API_path": "/TextClassification", + "API_tags": ["TextClassification"], + + "BACKEND_CORS_ORIGINS": ["*"], + "allow_credentials": true, + "allow_methods": ["*"], + "allow_headers": ["*"] + + }, + "LOGGING": { + "log_file_path": "", + "log_level": "INFO" + }, + + "PIPELINE": { + "pipeline_type": "text_classification", + "model_name": "IDEA-CCNL/Erlangshen-Roberta-110M-Similarity", + "model_settings": { + "device": -1, + "texta_name": "sentence", + "textb_name": "sentence2", + "label_name": "label", + "max_length": 512, + "return_tensors": "pt", + "padding": "longest", + "truncation": true, + "skip_special_tokens": true, + "clean_up_tkenization_spaces": true, + + "skip_steps": 10, + "clip_guidance_scale": 7500, + "init_scale": 10 + } + } +} + + \ No newline at end of file diff --git a/fengshen/API/utils.py b/fengshen/API/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..56b2b9a193b17612c1b77d0f170c593b307bc7f3 --- /dev/null +++ b/fengshen/API/utils.py @@ -0,0 +1,167 @@ +from dataclasses import dataclass, field +import os +import json +import logging +from argparse import Namespace +from typing import List, Literal, Optional, Union +from pydantic import AnyHttpUrl, BaseSettings, HttpUrl, validator, BaseModel + + +CURRENT_DIR_PATH = os.path.dirname(os.path.abspath(__file__)) + +# request body +# 使用pydantic对请求中的body数据进行验证 +class RequestDataStructure(BaseModel): + input_text: List[str] = [""] + uuid: Optional[int] + + # parameters for text2image model + input_image: Optional[str] + skip_steps: Optional[int] + clip_guidance_scale: Optional[int] + init_scale: Optional[int] + +# API config 
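+# APIConfig mirrors the three sections of the JSON config file passed on the command line
+# (SERVER / LOGGING / PIPELINE, see text_classification.json); setup_config() fills the
+# fields below at startup. Illustrative usage with the sample config (a sketch only; the
+# request body is validated against RequestDataStructure above):
+#   python main.py text_classification.json
+#   curl -X POST http://127.0.0.1:8990/TextClassification \
+#        -H "Content-Type: application/json" \
+#        -d '{"input_text": ["今天天气真好", "今天天气怎么样"]}'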
+@dataclass +class APIConfig: + + # server config + SERVER_HOST: AnyHttpUrl = "127.0.0.1" + SERVER_PORT: int = 8990 + SERVER_NAME: str = "" + PROJECT_NAME: str = "" + API_PREFIX_STR: str = "/api" + + # api config + API_method: Literal["POST","GET","PUT","OPTIONS","WEBSOCKET","PATCH","DELETE","TRACE","CONNECT"] = "POST" + API_path: str = "/TextClassification" + API_tags: List[str] = field(default_factory = lambda: [""]) + + # CORS config + BACKEND_CORS_ORIGINS: List[AnyHttpUrl] = field(default_factory = lambda: ["*"]) + allow_credentials: bool = True + allow_methods: List[str] = field(default_factory = lambda: ["*"]) + allow_headers: List[str] = field(default_factory = lambda: ["*"]) + + # log config + log_file_path: str = "" + log_level: str = "INFO" + + # pipeline config + pipeline_type: str = "" + model_name: str = "" + + # model config + # device: int = -1 + # texta_name: Optional[str] = "sentence" + # textb_name: Optional[str] = "sentence2" + # label_name: Optional[str] = "label" + # max_length: int = 512 + # return_tensors: str = "pt" + # padding: str = "longest" + # truncation: bool = True + # skip_special_tokens: bool = True + # clean_up_tkenization_spaces: bool = True + + # # parameters for text2image model + # skip_steps: Optional[int] = 0 + # clip_guidance_scale: Optional[int] = 0 + # init_scale: Optional[int] = 0 + + def setup_config(self, args:Namespace) -> None: + + # load config file + with open(CURRENT_DIR_PATH + "/" + args.config_path, "r") as jsonfile: + config = json.load(jsonfile) + + server_config = config["SERVER"] + logging_config = config["LOGGING"] + pipeline_config = config["PIPELINE"] + + # server config + self.SERVER_HOST: AnyHttpUrl = server_config["SERVER_HOST"] + self.SERVER_PORT: int = server_config["SERVER_PORT"] + self.SERVER_NAME: str = server_config["SERVER_NAME"] + self.PROJECT_NAME: str = server_config["PROJECT_NAME"] + self.API_PREFIX_STR: str = server_config["API_PREFIX_STR"] + + # api config + self.API_method: Literal["POST","GET","PUT","OPTIONS","WEBSOCKET","PATCH","DELETE","TRACE","CONNECT"] = server_config["API_method"] + self.API_path: str = server_config["API_path"] + self.API_tags: List[str] = server_config["API_tags"] + + # CORS config + self.BACKEND_CORS_ORIGINS: List[AnyHttpUrl] = server_config["BACKEND_CORS_ORIGINS"] + self.allow_credentials: bool = server_config["allow_credentials"] + self.allow_methods: List[str] = server_config["allow_methods"] + self.allow_headers: List[str] = server_config["allow_headers"] + + # log config + self.log_file_path: str = logging_config["log_file_path"] + self.log_level: str = logging_config["log_level"] + + # pipeline config + self.pipeline_type: str = pipeline_config["pipeline_type"] + self.model_name: str = pipeline_config["model_name"] + + # general model config + self.model_settings: dict = pipeline_config["model_settings"] + + # 由于pipeline本身会解析参数,后续参数可以不要 + # 直接将model_settings字典转为Namespace后作为pipeline的args参数即可 + + # self.device: int = self.model_settings["device"] + # self.texta_name: Optional[str] = self.model_settings["texta_name"] + # self.textb_name: Optional[str] = self.model_settings["textb_name"] + # self.label_name: Optional[str] = self.model_settings["label_name"] + # self.max_length: int = self.model_settings["max_length"] + # self.return_tensors: str = self.model_settings["return_tensors"] + # self.padding: str = self.model_settings["padding"] + # self.truncation: bool = self.model_settings["truncation"] + # self.skip_special_tokens: bool = self.model_settings["skip_special_tokens"] + # 
self.clean_up_tkenization_spaces: bool = self.model_settings["clean_up_tkenization_spaces"] + + # # specific parameters for text2image model + # self.skip_steps: Optional[int] = self.model_settings["skip_steps"] + # self.clip_guidance_scale: Optional[int] = self.model_settings["clip_guidance_scale"] + # self.init_scale: Optional[int] = self.model_settings["init_scale"] + + + +def setup_logger(logger, user_config: APIConfig): + + # default level: INFO + + logger.setLevel(getattr(logging, user_config.log_level, "INFO")) + ch = logging.StreamHandler() + + if(user_config.log_file_path == ""): + fh = logging.FileHandler(filename = CURRENT_DIR_PATH + "/" + user_config.SERVER_NAME + ".log") + elif(".log" not in user_config.log_file_path[-5:-1]): + fh = logging.FileHandler(filename = user_config.log_file_path + "/" + user_config.SERVER_NAME + ".log") + else: + fh = logging.FileHandler(filename = user_config.log_file_path) + + + formatter = logging.Formatter( + "%(asctime)s - %(module)s - %(funcName)s - line:%(lineno)d - %(levelname)s - %(message)s" + ) + + ch.setFormatter(formatter) + fh.setFormatter(formatter) + logger.addHandler(ch) # Exporting logs to the screen + logger.addHandler(fh) # Exporting logs to a file + + return logger + +user_config = APIConfig() +api_logger = logging.getLogger() + + + + + + + + + diff --git a/fengshen/README.md b/fengshen/README.md new file mode 100644 index 0000000000000000000000000000000000000000..45f7b3579c36a68f899a9a02cfcfbe1330d413d8 --- /dev/null +++ b/fengshen/README.md @@ -0,0 +1,105 @@ +## 最新发布 + +* \[2022.09.13\] [更新ErLangShen系列DeBERTa预训练代码](https://huggingface.co./IDEA-CCNL/Erlangshen-DeBERTa-v2-97M-Chinese) +* \[2022.09.13\] [更新RanDeng系列Bart预训练代码](https://huggingface.co./IDEA-CCNL/Randeng-BART-139M) +* \[2022.09.13\] [更新ErLangShen系列Bert预训练代码](https://huggingface.co./IDEA-CCNL/Erlangshen-MegatronBert-1.3B) +* \[2022.05.11\] [更新TaiYi系列VIT多模态模型及下游任务示例](https://fengshenbang-doc.readthedocs.io/zh/latest/docs/太乙系列/Taiyi-vit-87M-D.html) +* \[2022.05.11\] [更新BiGan系列Transformer-XL去噪模型及下游任务示例](https://fengshenbang-doc.readthedocs.io/zh/latest/docs/比干系列/Bigan-Transformer-XL-denoise-1.1B.html) +* \[2022.05.11\] [更新ErLangShen系列下游任务示例](https://fengshenbang-doc.readthedocs.io/zh/latest/docs/二郎神系列/Erlangshen-Roberta-110M-NLI.html) + +# 导航 + +- [导航](#导航) + - [框架简介](#框架简介) + - [依赖环境](#依赖环境) + - [项目结构](#项目结构) + - [设计思路](#设计思路) + - [分类下游任务](#分类下游任务) + +## 框架简介 + +FengShen训练框架是封神榜大模型开源计划的重要一环,在大模型的生产和应用中起到至关重要的作用。FengShen可以应用在基于海量数据的预训练以及各种下游任务的finetune中。封神榜专注于NLP大模型开源,然而模型的增大带来不仅仅是训练的问题,在使用上也存在诸多不便。为了解决训练和使用的问题,FengShen参考了目前开源的优秀方案并且重新设计了Pipeline,用户可以根据自己的需求,从封神榜中选取丰富的预训练模型,同时利用FengShen快速微调下游任务。 + +目前所有实例以及文档可以查看我们的[Wiki](https://fengshenbang-doc.readthedocs.io/zh/latest/index.html) +所有的模型可以在[Huggingface主页](https://huggingface.co./IDEA-CCNL)找到 + +通过我们的框架,你可以快速享受到: + +1. 比原生torch更强的性能,训练速度提升**300%** +2. 支持更大的模型,支持**百亿级别**内模型训练及微调 +3. 支持**TB级以上**的数据集,在家用主机上即可享受预训练模型带来的效果提升 +3. 丰富的预训练、下游任务示例,一键开始训练 +4. 适应各种设备环境,支持在CPU、GPU、TPU等不同设备上运行 +5. 
集成主流的分布式训练逻辑,无需修改代码即可支持DDP、Zero Optimizer等分布式优化技术 + +![avartar](../pics/fengshen_pic.png) + +## 依赖环境 + +* Python >= 3.8 +* torch >= 1.8 +* transformers >= 3.2.0 +* pytorch-lightning >= 1.5.10 + +在Fengshenbang-LM根目录下 +pip install --editable ./ + +## 项目结构 + +``` +├── data # 支持多种数据处理方式以及数据集 +│   ├── cbart_dataloader +| ├── fs_datasets # 基于transformers datasets的封装,新增中文数据集(开源计划中) +| ├── universal_datamodule # 打通fs_datasets与lightning datamodule,减少重复开发工作量 +│   ├── megatron_dataloader # 支持基于Megatron实现的TB级别数据集处理、训练 +│   ├── mmap_dataloader # 通用的Memmap形式的数据加载 +│   └── task_dataloader # 支持多种下游任务 +├── examples # 丰富的示例,从预训练到下游任务,应有尽有。 +├── metric # 提供各种metric计算,支持用户自定义metric +├── losses # 同样支持loss自定义,满足定制化需求 +├── tokenizer # 支持自定义tokenizer,比如我们使用的SentencePiece训练代码等 +├── models # 模型库 +│   ├── auto # 支持自动导入对应的模型 +│   ├── bart +│   ├── longformer +│   ├── megatron_t5 +│   └── roformer +└── utils # 实用函数 +``` + +## 设计思路 + +FengShen框架目前整体基于Pytorch-Lightning & Transformer进行开发,在底层框架上不断开源基于中文的预训练模型,同时提供丰富的examples,每一个封神榜的模型都能找到对应的预训练、下游任务代码。 + +在FengShen上开发,整体可以按照下面的三个步骤进行: + +1. 封装数据处理流程 -> pytorch_lightning.LightningDataModule +2. 封装模型结构 -> pytorch_lightning.LightningModule +3. 配置一些插件,比如log_monitor,checkpoint_callback等等。 + +一个完整的DEMO可以看Randeng-BART系列实例 -> [文档](https://fengshenbang-doc.readthedocs.io/zh/latest/docs/燃灯系列/BART-139M.html) [代码](https://github.com/IDEA-CCNL/Fengshenbang-LM/tree/hf-ds/fengshen/examples/pretrain_bart) + +## 分类下游任务 + + 在examples/classification目录下,我们提供丰富的分类任务的示例,其中我们提供三个一键式运行的示例。 + +* demo_classification_afqmc_roberta.sh 使用DDP微调roberta +* demo_classification_afqmc_roberta_deepspeed.sh 结合deepspeed微调roberta,获得更快的运算速度 +* demo_classification_afqmc_erlangshen_offload.sh 仅需7G显存即可微调我们效果最好的二郎神系列模型 + + 上述示例均采用AFQMC的数据集,关于数据集的介绍可以在[这里](https://www.cluebenchmarks.com/introduce.html)找到。 + 同时我们处理过的数据文件已经放在Huggingface上,点击[这里](https://huggingface.co./datasets/IDEA-CCNL/AFQMC)直达源文件。 + 仅需要按我们的格式稍微处理一下数据集,即可适配下游不同的分类任务。 + 在脚本示例中,仅需要修改如下参数即可适配本地文件 + + ``` + --dataset_name IDEA-CCNL/AFQMC \ + + -------> 修改为 + + --data_dir $DATA_DIR \ # 数据目录 + --train_data train.json \ # 数据文件 + --valid_data dev.json \ + --test_data test.json \ + + ``` diff --git a/fengshen/__init__.py b/fengshen/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..5cc52d128218a4878e5778502e25eadf54cf1261 --- /dev/null +++ b/fengshen/__init__.py @@ -0,0 +1,19 @@ +# coding=utf-8 +# Copyright 2021 The IDEA Authors. All rights reserved. + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at + +# http://www.apache.org/licenses/LICENSE-2.0 + +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
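+# Convenience re-exports: the imports below expose the most commonly used model classes and
+# the Ubert pipeline at the package top level, e.g. `from fengshen import UbertPipelines`.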
+ +from .models.longformer import LongformerConfig, LongformerModel +from .models.roformer import RoFormerConfig, RoFormerModel +from .models.megatron_t5 import T5Config, T5EncoderModel +from .models.ubert import UbertPipelines, UbertModel diff --git a/fengshen/cli/fengshen_pipeline.py b/fengshen/cli/fengshen_pipeline.py new file mode 100644 index 0000000000000000000000000000000000000000..07c31349ef96fd86d0c14b807601c645b095372f --- /dev/null +++ b/fengshen/cli/fengshen_pipeline.py @@ -0,0 +1,34 @@ +import sys +from importlib import import_module +from datasets import load_dataset +import argparse + + +def main(): + if len(sys.argv) < 3: + raise Exception( + 'args len < 3, example: fengshen_pipeline text_classification predict xxxxx') + pipeline_name = sys.argv[1] + method = sys.argv[2] + pipeline_class = getattr(import_module('fengshen.pipelines.' + pipeline_name), 'Pipeline') + + total_parser = argparse.ArgumentParser("FengShen Pipeline") + total_parser.add_argument('--model', default='', type=str) + total_parser.add_argument('--datasets', default='', type=str) + total_parser.add_argument('--text', default='', type=str) + total_parser = pipeline_class.add_pipeline_specific_args(total_parser) + args = total_parser.parse_args(args=sys.argv[3:]) + pipeline = pipeline_class(args=args, model=args.model) + + if method == 'predict': + print(pipeline(args.text)) + elif method == 'train': + datasets = load_dataset(args.datasets) + pipeline.train(datasets) + else: + raise Exception( + 'cmd not support, now only support {predict, train}') + + +if __name__ == '__main__': + main() diff --git a/fengshen/data/__init__.py b/fengshen/data/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..9bad5790a5799b96f2e164d825c0b1f8ec0c2dfb --- /dev/null +++ b/fengshen/data/__init__.py @@ -0,0 +1 @@ +# coding=utf-8 diff --git a/fengshen/data/bert_dataloader/auto_split.sh b/fengshen/data/bert_dataloader/auto_split.sh new file mode 100644 index 0000000000000000000000000000000000000000..0a0f66d01df8f1728e44d9deb1d37e0396c5143a --- /dev/null +++ b/fengshen/data/bert_dataloader/auto_split.sh @@ -0,0 +1,10 @@ +files=`find $1 -type f -size +1024M` + +for p in $files +do +echo "processing $p" +name=`basename $p .json` +file=`dirname $p` +split -a 2 -C 300M $p $file/$name- && ls|grep -E "(-[a-zA-Z]{2})" |xargs -n1 -i{} mv {} {}.json +rm -f $p +done \ No newline at end of file diff --git a/fengshen/data/bert_dataloader/load.py b/fengshen/data/bert_dataloader/load.py new file mode 100644 index 0000000000000000000000000000000000000000..b36ce8ae72b74e9fd006f087ee0810a306badd7e --- /dev/null +++ b/fengshen/data/bert_dataloader/load.py @@ -0,0 +1,200 @@ +import os +import re +from pathlib import Path +import glob +from tqdm import tqdm +from contextlib import ExitStack +import datasets +import multiprocessing +from typing import cast, TextIO +from itertools import chain +import json +from concurrent.futures import ProcessPoolExecutor +from random import shuffle +from pytorch_lightning import LightningDataModule +from typing import Optional + +from torch.utils.data import DataLoader + + +# _SPLIT_DATA_PATH = '/data1/datas/wudao_180g_split/test' +_SPLIT_DATA_PATH = '/data1/datas/wudao_180g_split' +_CACHE_SPLIT_DATA_PATH = '/data1/datas/wudao_180g_FSData' + +# feats = datasets.Features({"text": datasets.Value('string')}) + + +class BertDataGenerate(object): + + def __init__(self, + data_files=_SPLIT_DATA_PATH, + save_path=_CACHE_SPLIT_DATA_PATH, + train_test_validation='950,49,1', + num_proc=1, + 
cache=True): + self.data_files = Path(data_files) + if save_path: + self.save_path = Path(save_path) + else: + self.save_path = self.file_check( + Path(self.data_files.parent, self.data_files.name+'_FSDataset'), + 'save') + self.num_proc = num_proc + self.cache = cache + self.split_idx = self.split_train_test_validation_index(train_test_validation) + if cache: + self.cache_path = self.file_check( + Path(self.save_path.parent, 'FSDataCache', self.data_files.name), 'cache') + else: + self.cache_path = None + + @staticmethod + def file_check(path, path_type): + print(path) + if not path.exists(): + path.mkdir(parents=True) + print(f"Since no {path_type} directory is specified, the program will automatically create it in {path} directory.") + return str(path) + + @staticmethod + def split_train_test_validation_index(train_test_validation): + split_idx_ = [int(i) for i in train_test_validation.split(',')] + idx_dict = { + 'train_rate': split_idx_[0]/sum(split_idx_), + 'test_rate': split_idx_[1]/sum(split_idx_[1:]) + } + return idx_dict + + def process(self, index, path): + print('saving dataset shard {}'.format(index)) + + ds = (datasets.load_dataset('json', data_files=str(path), + cache_dir=self.cache_path, + features=None)) + # ds = ds.map(self.cut_sent,input_columns='text') + # print(d) + # print('!!!',ds) + ds = ds['train'].train_test_split(train_size=self.split_idx['train_rate']) + ds_ = ds['test'].train_test_split(train_size=self.split_idx['test_rate']) + ds = datasets.DatasetDict({ + 'train': ds['train'], + 'test': ds_['train'], + 'validation': ds_['test'] + }) + # print('!!!!',ds) + ds.save_to_disk(Path(self.save_path, path.name)) + return 'saving dataset shard {} done'.format(index) + + def generate_cache_arrow(self) -> None: + ''' + 生成HF支持的缓存文件,加速后续的加载 + ''' + data_dict_paths = self.data_files.rglob('*') + p = ProcessPoolExecutor(max_workers=self.num_proc) + res = list() + + for index, path in enumerate(data_dict_paths): + res.append(p.submit(self.process, index, path)) + + p.shutdown(wait=True) + for future in res: + print(future.result(), flush=True) + + +def load_dataset(num_proc=4, **kargs): + cache_dict_paths = Path(_CACHE_SPLIT_DATA_PATH).glob('*') + ds = [] + res = [] + p = ProcessPoolExecutor(max_workers=num_proc) + for path in cache_dict_paths: + res.append(p.submit(datasets.load_from_disk, + str(path), **kargs)) + + p.shutdown(wait=True) + for future in res: + ds.append(future.result()) + # print(future.result()) + train = [] + test = [] + validation = [] + for ds_ in ds: + train.append(ds_['train']) + test.append(ds_['test']) + validation.append(ds_['validation']) + # ds = datasets.concatenate_datasets(ds) + # print(ds) + return datasets.DatasetDict({ + 'train': datasets.concatenate_datasets(train), + 'test': datasets.concatenate_datasets(test), + 'validation': datasets.concatenate_datasets(validation) + }) + + +class BertDataModule(LightningDataModule): + @ staticmethod + def add_data_specific_args(parent_args): + parser = parent_args.add_argument_group('Universal DataModule') + parser.add_argument('--num_workers', default=8, type=int) + parser.add_argument('--train_batchsize', default=32, type=int) + parser.add_argument('--val_batchsize', default=32, type=int) + parser.add_argument('--test_batchsize', default=32, type=int) + parser.add_argument('--datasets_name', type=str) + # parser.add_argument('--datasets_name', type=str) + parser.add_argument('--train_datasets_field', type=str, default='train') + parser.add_argument('--val_datasets_field', type=str, 
default='validation') + parser.add_argument('--test_datasets_field', type=str, default='test') + return parent_args + + def __init__( + self, + tokenizer, + collate_fn, + args, + **kwargs, + ): + super().__init__() + self.datasets = load_dataset(num_proc=args.num_workers) + self.tokenizer = tokenizer + self.collate_fn = collate_fn + self.save_hyperparameters(args) + + def setup(self, stage: Optional[str] = None) -> None: + self.train = DataLoader( + self.datasets[self.hparams.train_datasets_field], + batch_size=self.hparams.train_batchsize, + shuffle=True, + num_workers=self.hparams.num_workers, + collate_fn=self.collate_fn, + ) + self.val = DataLoader( + self.datasets[self.hparams.val_datasets_field], + batch_size=self.hparams.val_batchsize, + shuffle=False, + num_workers=self.hparams.num_workers, + collate_fn=self.collate_fn, + ) + self.test = DataLoader( + self.datasets[self.hparams.test_datasets_field], + batch_size=self.hparams.test_batchsize, + shuffle=False, + num_workers=self.hparams.num_workers, + collate_fn=self.collate_fn, + ) + return + + def train_dataloader(self): + return self.train + + def val_dataloader(self): + return self.val + + def test_dataloader(self): + return self.test + + +if __name__ == '__main__': + # pre = PreProcessing(_SPLIT_DATA_PATH) + # pre.processing() + + dataset = BertDataGenerate(_SPLIT_DATA_PATH, num_proc=16) + dataset.generate_cache_arrow() diff --git a/fengshen/data/bert_dataloader/preprocessing.py b/fengshen/data/bert_dataloader/preprocessing.py new file mode 100644 index 0000000000000000000000000000000000000000..c40e39a8122a5cc4ebd57b558f451c371f6066a3 --- /dev/null +++ b/fengshen/data/bert_dataloader/preprocessing.py @@ -0,0 +1,110 @@ +import re +import json +import multiprocessing +from tqdm import tqdm +from pathlib import Path +from itertools import chain + +_SPLIT_DATA_PATH = '/data1/datas/wudao_180g' + + +def cut_sent(path): + """ + 中文分句,默认?、。、!、省略号分句,考虑双引号包裹的句子 + 采用分割替换的方式 + """ + path = Path(path) + # print(path) + save_path = str(Path('/data1/datas/wudao_180g_split', path.name)) + print('处理文件:', save_path) + with open(save_path, 'wt', encoding='utf-8') as w: + with open(path, 'rt', encoding='utf-8') as f: + for para in tqdm(f): + para = json.loads(para) + para_ = para['text'] + ' ' + # print('sentence piece......') + # pep8中 正则不能些 \? 要写成\\? 
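+                # Splitting strategy: insert a "#####" marker after sentence-final punctuation
+                # (?。!?!… plus the quote-closed cases handled below), then split on "#####"
+                # and re-pack the pieces into samples of at most ~512 characters.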
+ para_ = re.sub('([?。!\\?\\!…]+)([^”’]|[”’])', + r'\1#####\2', para_) + para_ = re.sub('([\\.]{3,})([^”’])', r'\1#####\2', para_) + + # 匹配 \1: 句子结束符紧挨’” \2: 非句子结束符号,被引号包裹的句子 + para_ = re.sub( + '([。!?\\?\\!…][”’])([^,。!?\\?\\!]|\\s)', r'\1#####\2', para_) + para_ = re.sub( + '([\\.]{3,}[”’])([^,。!?\\?\\!]|\\s)', r'\1#####\2', para_) + para_ = re.sub( + '([#]{5})([”’])([^,。!?\\?\\!])', r'\2#####\3', para_) + para_ = para_.strip() + # 一个512里面多个样本 + line_ = '' + for line in para_.split('#####'): + line = line.strip() + if len(line_) < 512 and len(line) > 0: + line_ += line + else: + w.writelines(json.dumps( + {'text': line_}, ensure_ascii=False)+'\n') + line_ = line + w.writelines(json.dumps( + {'text': line_}, ensure_ascii=False)+'\n') + + +def chain_iter(*filenames): + """ + 将多个文件读成一个迭代器 + """ + reader = [open(file, 'r') for file in filenames] + return chain(*reader) + + +class Config(object): + + def __init__(self, data_path=_SPLIT_DATA_PATH, num_worker=16, split_numb=600000, cut_sentence=True, output_file=None) -> None: + self.data_path = Path(data_path) + self.num_worker = num_worker + self.split_numb = split_numb + self.cut_sentence = cut_sentence + + +def processing1(): + args = Config() + p_ = [str(i) for i in args.data_path.glob('*')] + fin = chain_iter(*p_) + pool = multiprocessing.Pool(args.num_worker) + docs = pool.imap(cut_sent, fin, chunksize=args.num_worker) + + if not Path(args.data_path.parent, args.data_path.name+'_split').exists(): + Path(args.data_path.parent, args.data_path.name+'_split').mkdir() + writer = open(str(Path(args.data_path.parent, args.data_path.name + + '_split', 'sentence_level.json')), 'wt', encoding='utf-8') + for doc in tqdm(docs): + for sentence in doc: + writer.writelines(json.dumps( + {"text": sentence}, ensure_ascii=False)+'\n') + pool.close() + pool.join() + writer.close() + + +if __name__ == '__main__': + from time import process_time, perf_counter + from random import shuffle + st = process_time() + args = Config(num_worker=16) + + if not Path(args.data_path.parent, args.data_path.name+'_split').exists(): + Path(args.data_path.parent, args.data_path.name + + '_split').mkdir(parents=True) + + p_ = [str(i) for i in args.data_path.glob('*')] + # 简单shuffle + shuffle(p_) + + pool = multiprocessing.Pool(args.num_worker) + for item in p_: + pool.apply_async(func=cut_sent, args=(item,)) + pool.close() + pool.join() + cost_time = process_time() - st + print('DONE!! 
cost time : %.5f' % cost_time) diff --git a/fengshen/data/clip_dataloader/flickr.py b/fengshen/data/clip_dataloader/flickr.py new file mode 100644 index 0000000000000000000000000000000000000000..22155e039f74b49c8a4222a75144a2c134a6d507 --- /dev/null +++ b/fengshen/data/clip_dataloader/flickr.py @@ -0,0 +1,105 @@ +from torch.utils.data import Dataset, DataLoader +from torchvision.transforms import Normalize, Compose, RandomResizedCrop, InterpolationMode, ToTensor, Resize, \ + CenterCrop +from transformers import BertTokenizer +import pytorch_lightning as pl +from PIL import Image +import os + + +class flickr30k_CNA(Dataset): + def __init__(self, img_root_path, + annot_path, + transform=None): + self.images = [] + self.captions = [] + self.labels = [] + self.root = img_root_path + with open(annot_path, 'r') as f: + for line in f: + line = line.strip().split('\t') + key, caption = line[0].split('#')[0], line[1] + img_path = key + '.jpg' + self.images.append(img_path) + self.captions.append(caption) + self.labels.append(key) + self.transforms = transform + self.tokenizer = BertTokenizer.from_pretrained("hfl/chinese-roberta-wwm-ext") + + # NOTE large 模型 + self.context_length = 77 + + def __len__(self): + return len(self.images) + + def __getitem__(self, idx): + img_path = str(self.images[idx]) + image = self.transforms(Image.open(os.path.join(self.root, img_path))) + text = self.tokenizer(str(self.captions[idx]), max_length=self.context_length, + padding='max_length', truncation=True, return_tensors='pt')['input_ids'][0] + label = self.labels[idx] + return image, text, label + + +def _convert_to_rgb(image): + return image.convert('RGB') + + +def image_transform( + image_size: int, + is_train: bool, + mean=(0.48145466, 0.4578275, 0.40821073), + std=(0.26862954, 0.26130258, 0.27577711) +): + normalize = Normalize(mean=mean, std=std) + if is_train: + return Compose([ + RandomResizedCrop(image_size, scale=(0.9, 1.0), interpolation=InterpolationMode.BICUBIC), + _convert_to_rgb, + ToTensor(), + normalize, + ]) + else: + return Compose([ + Resize(image_size, interpolation=InterpolationMode.BICUBIC), + CenterCrop(image_size), + _convert_to_rgb, + ToTensor(), + normalize, + ]) + + +class FlickrDataModule(pl.LightningDataModule): + def __init__(self, args): + self.batch_size = args.batch_size + self.train_filename = args.train_filename # NOTE 标注的文件夹 + self.train_root = args.train_root # NOTE 图片地址 + self.val_filename = args.val_filename + self.val_root = args.val_root + self.test_filename = args.test_filename + self.test_root = args.test_root + + self.pretrain_model = args.pretrain_model + self.image_size = 224 + self.prepare_data_per_node = True + self._log_hyperparams = False + self.num_workers = args.num_workers + + def setup(self, stage=None): + # dataset + train_transform = image_transform(224, True) + val_transform = image_transform(224, False) + test_transform = image_transform(224, False) + + self.train_dataset = flickr30k_CNA(self.train_root, self.train_filename, transform=train_transform) + self.val_dataset = flickr30k_CNA(self.val_root, self.val_filename, transform=val_transform) + self.test_dataset = flickr30k_CNA(self.test_root, self.test_filename, transform=test_transform) + + def train_dataloader(self): + return DataLoader(self.train_dataset, batch_size=self.batch_size, num_workers=self.num_workers) + + def val_dataloader(self): + return DataLoader(self.val_dataset, batch_size=self.batch_size, num_workers=self.num_workers) + + def test_dataloader(self): + return 
DataLoader(self.test_dataset, batch_size=self.batch_size, num_workers=self.num_workers) diff --git a/fengshen/data/data_utils/common_utils.py b/fengshen/data/data_utils/common_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..3eef10ecb8c73257ab4338a0ea2e7839b82bcc7e --- /dev/null +++ b/fengshen/data/data_utils/common_utils.py @@ -0,0 +1,4 @@ +def padding_to_maxlength(ids, max_length, pad_id): + cur_len = len(ids) + len_diff = max_length - len(ids) + return ids + [pad_id] * len_diff, [1] * cur_len + [0] * len_diff diff --git a/fengshen/data/data_utils/mask_utils.py b/fengshen/data/data_utils/mask_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..0009f00272bf6feff1dbd491153332584cb431e1 --- /dev/null +++ b/fengshen/data/data_utils/mask_utils.py @@ -0,0 +1,285 @@ +import collections + +import numpy as np + +MaskedLmInstance = collections.namedtuple("MaskedLmInstance", + ["index", "label"]) + + +def is_start_piece(piece): + """Check if the current word piece is the starting piece (BERT).""" + # When a word has been split into + # WordPieces, the first token does not have any marker and any subsequence + # tokens are prefixed with ##. So whenever we see the ## token, we + # append it to the previous set of word indexes. + return not piece.startswith("##") + + +def create_masked_lm_predictions(tokens, + vocab_id_list, vocab_id_to_token_dict, + masked_lm_prob, + cls_id, sep_id, mask_id, + max_predictions_per_seq, + np_rng, + max_ngrams=3, + do_whole_word_mask=True, + favor_longer_ngram=False, + do_permutation=False, + geometric_dist=False, + masking_style="bert", + zh_tokenizer=None): + """Creates the predictions for the masked LM objective. + Note: Tokens here are vocab ids and not text tokens.""" + ''' + modified from Megatron-LM + Args: + tokens: 输入 + vocab_id_list: 词表token_id_list + vocab_id_to_token_dict: token_id到token字典 + masked_lm_prob:mask概率 + cls_id、sep_id、mask_id:特殊token + max_predictions_per_seq:最大mask个数 + np_rng:mask随机数 + max_ngrams:最大词长度 + do_whole_word_mask:是否做全词掩码 + favor_longer_ngram:优先用长的词 + do_permutation:是否打乱 + geometric_dist:用np_rng.geometric做随机 + masking_style:mask类型 + zh_tokenizer:WWM的分词器,比如用jieba.lcut做分词之类的 + ''' + cand_indexes = [] + # Note(mingdachen): We create a list for recording if the piece is + # the starting piece of current token, where 1 means true, so that + # on-the-fly whole word masking is possible. + token_boundary = [0] * len(tokens) + # 如果没有指定中文分词器,那就直接按##算 + if zh_tokenizer is None: + for (i, token) in enumerate(tokens): + if token == cls_id or token == sep_id: + token_boundary[i] = 1 + continue + # Whole Word Masking means that if we mask all of the wordpieces + # corresponding to an original word. + # + # Note that Whole Word Masking does *not* change the training code + # at all -- we still predict each WordPiece independently, softmaxed + # over the entire vocabulary. 
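+            # e.g. the pieces ["play", "##ing"] end up in one candidate group [i, i+1],
+            # so a whole-word mask always covers both pieces together.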
+ if (do_whole_word_mask and len(cand_indexes) >= 1 and + not is_start_piece(vocab_id_to_token_dict[token])): + cand_indexes[-1].append(i) + else: + cand_indexes.append([i]) + if is_start_piece(vocab_id_to_token_dict[token]): + token_boundary[i] = 1 + else: + # 如果指定了中文分词器,那就先用分词器分词,然后再进行判断 + # 获取去掉CLS SEP的原始文本 + raw_tokens = [] + for t in tokens: + if t != cls_id and t != sep_id: + raw_tokens.append(t) + raw_tokens = [vocab_id_to_token_dict[i] for i in raw_tokens] + # 分词然后获取每次字开头的最长词的长度 + word_list = set(zh_tokenizer(''.join(raw_tokens), HMM=True)) + word_length_dict = {} + for w in word_list: + if len(w) < 1: + continue + if w[0] not in word_length_dict: + word_length_dict[w[0]] = len(w) + elif word_length_dict[w[0]] < len(w): + word_length_dict[w[0]] = len(w) + i = 0 + # 从词表里面检索 + while i < len(tokens): + token_id = tokens[i] + token = vocab_id_to_token_dict[token_id] + if len(token) == 0 or token_id == cls_id or token_id == sep_id: + token_boundary[i] = 1 + i += 1 + continue + word_max_length = 1 + if token[0] in word_length_dict: + word_max_length = word_length_dict[token[0]] + j = 0 + word = '' + word_end = i+1 + # 兼容以前##的形式,如果后面的词是##开头的,那么直接把后面的拼到前面当作一个词 + old_style = False + while word_end < len(tokens) and vocab_id_to_token_dict[tokens[word_end]].startswith('##'): + old_style = True + word_end += 1 + if not old_style: + while j < word_max_length and i+j < len(tokens): + cur_token = tokens[i+j] + word += vocab_id_to_token_dict[cur_token] + j += 1 + if word in word_list: + word_end = i+j + cand_indexes.append([p for p in range(i, word_end)]) + token_boundary[i] = 1 + i = word_end + + output_tokens = list(tokens) + + masked_lm_positions = [] + masked_lm_labels = [] + + if masked_lm_prob == 0: + return (output_tokens, masked_lm_positions, + masked_lm_labels, token_boundary) + + num_to_predict = min(max_predictions_per_seq, + max(1, int(round(len(tokens) * masked_lm_prob)))) + + ngrams = np.arange(1, max_ngrams + 1, dtype=np.int64) + if not geometric_dist: + # Note(mingdachen): + # By default, we set the probilities to favor shorter ngram sequences. + pvals = 1. / np.arange(1, max_ngrams + 1) + pvals /= pvals.sum(keepdims=True) + if favor_longer_ngram: + pvals = pvals[::-1] + # 获取一个ngram的idx,对于每个word,记录他的ngram的word + ngram_indexes = [] + for idx in range(len(cand_indexes)): + ngram_index = [] + for n in ngrams: + ngram_index.append(cand_indexes[idx:idx + n]) + ngram_indexes.append(ngram_index) + + np_rng.shuffle(ngram_indexes) + + (masked_lms, masked_spans) = ([], []) + covered_indexes = set() + for cand_index_set in ngram_indexes: + if len(masked_lms) >= num_to_predict: + break + if not cand_index_set: + continue + # Note(mingdachen): + # Skip current piece if they are covered in lm masking or previous ngrams. + for index_set in cand_index_set[0]: + for index in index_set: + if index in covered_indexes: + continue + + if not geometric_dist: + n = np_rng.choice(ngrams[:len(cand_index_set)], + p=pvals[:len(cand_index_set)] / + pvals[:len(cand_index_set)].sum(keepdims=True)) + else: + # Sampling "n" from the geometric distribution and clipping it to + # the max_ngrams. Using p=0.2 default from the SpanBERT paper + # https://arxiv.org/pdf/1907.10529.pdf (Sec 3.1) + n = min(np_rng.geometric(0.2), max_ngrams) + + index_set = sum(cand_index_set[n - 1], []) + n -= 1 + # Note(mingdachen): + # Repeatedly looking for a candidate that does not exceed the + # maximum number of predictions by trying shorter ngrams. 
+ while len(masked_lms) + len(index_set) > num_to_predict: + if n == 0: + break + index_set = sum(cand_index_set[n - 1], []) + n -= 1 + # If adding a whole-word mask would exceed the maximum number of + # predictions, then just skip this candidate. + if len(masked_lms) + len(index_set) > num_to_predict: + continue + is_any_index_covered = False + for index in index_set: + if index in covered_indexes: + is_any_index_covered = True + break + if is_any_index_covered: + continue + for index in index_set: + covered_indexes.add(index) + masked_token = None + token_id = tokens[index] + if masking_style == "bert": + # 80% of the time, replace with [MASK] + if np_rng.random() < 0.8: + masked_token = mask_id + else: + # 10% of the time, keep original + if np_rng.random() < 0.5: + masked_token = tokens[index] + # 10% of the time, replace with random word + else: + masked_token = vocab_id_list[np_rng.randint(0, len(vocab_id_list))] + elif masking_style == "t5": + masked_token = mask_id + else: + raise ValueError("invalid value of masking style") + + output_tokens[index] = masked_token + masked_lms.append(MaskedLmInstance(index=index, label=token_id)) + + masked_spans.append(MaskedLmInstance( + index=index_set, + label=[tokens[index] for index in index_set])) + + assert len(masked_lms) <= num_to_predict + np_rng.shuffle(ngram_indexes) + + select_indexes = set() + if do_permutation: + for cand_index_set in ngram_indexes: + if len(select_indexes) >= num_to_predict: + break + if not cand_index_set: + continue + # Note(mingdachen): + # Skip current piece if they are covered in lm masking or previous ngrams. + for index_set in cand_index_set[0]: + for index in index_set: + if index in covered_indexes or index in select_indexes: + continue + + n = np.random.choice(ngrams[:len(cand_index_set)], + p=pvals[:len(cand_index_set)] / + pvals[:len(cand_index_set)].sum(keepdims=True)) + index_set = sum(cand_index_set[n - 1], []) + n -= 1 + + while len(select_indexes) + len(index_set) > num_to_predict: + if n == 0: + break + index_set = sum(cand_index_set[n - 1], []) + n -= 1 + # If adding a whole-word mask would exceed the maximum number of + # predictions, then just skip this candidate. 
+ if len(select_indexes) + len(index_set) > num_to_predict: + continue + is_any_index_covered = False + for index in index_set: + if index in covered_indexes or index in select_indexes: + is_any_index_covered = True + break + if is_any_index_covered: + continue + for index in index_set: + select_indexes.add(index) + assert len(select_indexes) <= num_to_predict + + select_indexes = sorted(select_indexes) + permute_indexes = list(select_indexes) + np_rng.shuffle(permute_indexes) + orig_token = list(output_tokens) + + for src_i, tgt_i in zip(select_indexes, permute_indexes): + output_tokens[src_i] = orig_token[tgt_i] + masked_lms.append(MaskedLmInstance(index=src_i, label=orig_token[src_i])) + + masked_lms = sorted(masked_lms, key=lambda x: x.index) + # Sort the spans by the index of the first span + masked_spans = sorted(masked_spans, key=lambda x: x.index[0]) + + for p in masked_lms: + masked_lm_positions.append(p.index) + masked_lm_labels.append(p.label) + return (output_tokens, masked_lm_positions, masked_lm_labels, token_boundary, masked_spans) diff --git a/fengshen/data/data_utils/sentence_split.py b/fengshen/data/data_utils/sentence_split.py new file mode 100644 index 0000000000000000000000000000000000000000..1a25e4b51b13f86f4a8a6b39f497f85c050856b6 --- /dev/null +++ b/fengshen/data/data_utils/sentence_split.py @@ -0,0 +1,35 @@ +import re + + +class ChineseSentenceSplitter(object): + def merge_symmetry(self, sentences, symmetry=('“', '”')): + # '''合并对称符号,如双引号''' + effective_ = [] + merged = True + for index in range(len(sentences)): + if symmetry[0] in sentences[index] and symmetry[1] not in sentences[index]: + merged = False + effective_.append(sentences[index]) + elif symmetry[1] in sentences[index] and not merged: + merged = True + effective_[-1] += sentences[index] + elif symmetry[0] not in sentences[index] and symmetry[1] not in sentences[index] and not merged: + effective_[-1] += sentences[index] + else: + effective_.append(sentences[index]) + return [i.strip() for i in effective_ if len(i.strip()) > 0] + + def to_sentences(self, paragraph): + # """由段落切分成句子""" + sentences = re.split(r"(?|。|[!]+|!|\…\…)", paragraph) + sentences.append("") + sentences = ["".join(i) for i in zip(sentences[0::2], sentences[1::2])] + sentences = [i.strip() for i in sentences if len(i.strip()) > 0] + for j in range(1, len(sentences)): + if sentences[j][0] == '”': + sentences[j-1] = sentences[j-1] + '”' + sentences[j] = sentences[j][1:] + return self.merge_symmetry(sentences) + + def tokenize(self, text): + return self.to_sentences(text) diff --git a/fengshen/data/data_utils/sop_utils.py b/fengshen/data/data_utils/sop_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..505f14dca99638b10eee0a4017447401a71ef083 --- /dev/null +++ b/fengshen/data/data_utils/sop_utils.py @@ -0,0 +1,32 @@ + +# copy from megatron +def get_a_and_b_segments(sample, np_rng): + """Divide sample into a and b segments.""" + + # Number of sentences in the sample. + n_sentences = len(sample) + # Make sure we always have two sentences. + assert n_sentences > 1, 'make sure each sample has at least two sentences.' + + # First part: + # `a_end` is how many sentences go into the `A`. + a_end = 1 + if n_sentences >= 3: + # Note that randin in numpy is exclusive. 
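+        # i.e. a_end is drawn from [1, n_sentences - 1], so segment B always keeps at least one sentence.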
+ a_end = np_rng.randint(1, n_sentences) + tokens_a = [] + for j in range(a_end): + tokens_a.extend(sample[j]) + + # Second part: + tokens_b = [] + for j in range(a_end, n_sentences): + tokens_b.extend(sample[j]) + + # Random next: + is_next_random = False + if np_rng.random() < 0.5: + is_next_random = True + tokens_a, tokens_b = tokens_b, tokens_a + + return tokens_a, tokens_b, is_next_random diff --git a/fengshen/data/data_utils/token_type_utils.py b/fengshen/data/data_utils/token_type_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..3b805d23b9aa4cda495d3b76ecba7effdc2854eb --- /dev/null +++ b/fengshen/data/data_utils/token_type_utils.py @@ -0,0 +1,25 @@ +def create_tokens_and_tokentypes(tokens_a, tokens_b, cls_id, sep_id): + """Merge segments A and B, add [CLS] and [SEP] and build tokentypes.""" + + tokens = [] + tokentypes = [] + # [CLS]. + tokens.append(cls_id) + tokentypes.append(0) + # Segment A. + for token in tokens_a: + tokens.append(token) + tokentypes.append(0) + # [SEP]. + tokens.append(sep_id) + tokentypes.append(0) + # Segment B. + for token in tokens_b: + tokens.append(token) + tokentypes.append(1) + if tokens_b: + # [SEP]. + tokens.append(sep_id) + tokentypes.append(1) + + return tokens, tokentypes diff --git a/fengshen/data/data_utils/truncate_utils.py b/fengshen/data/data_utils/truncate_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..ba4c6b653762c01a26da1bea9cb3d3cbeec08fd7 --- /dev/null +++ b/fengshen/data/data_utils/truncate_utils.py @@ -0,0 +1,19 @@ + +def truncate_segments(tokens_a, tokens_b, len_a, len_b, max_num_tokens, np_rng): + """Truncates a pair of sequences to a maximum sequence length.""" + # print(len_a, len_b, max_num_tokens) + assert len_a > 0 + if len_a + len_b <= max_num_tokens: + return False + while len_a + len_b > max_num_tokens: + if len_a > len_b: + len_a -= 1 + tokens = tokens_a + else: + len_b -= 1 + tokens = tokens_b + if np_rng.random() < 0.5: + del tokens[0] + else: + tokens.pop() + return True diff --git a/fengshen/data/dreambooth_datasets/dreambooth_datasets.py b/fengshen/data/dreambooth_datasets/dreambooth_datasets.py new file mode 100644 index 0000000000000000000000000000000000000000..6f94216f3dadbd5423dfdb53fe1b2ff9382fb4d5 --- /dev/null +++ b/fengshen/data/dreambooth_datasets/dreambooth_datasets.py @@ -0,0 +1,183 @@ +# -*- encoding: utf-8 -*- +''' +Copyright 2022 The International Digital Economy Academy (IDEA). CCNL team. All rights reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+@File : dreambooth_datasets.py +@Time : 2022/11/10 00:20 +@Author : Gan Ruyi +@Version : 1.0 +@Contact : ganruyi@idea.edu.cn +@License : (C)Copyright 2022-2023, CCNL-IDEA +''' +from torch.utils.data import Dataset +from torchvision import transforms +from PIL import Image +from pathlib import Path + + +def add_data_args(parent_args): + parser = parent_args.add_argument_group('taiyi stable diffusion data args') + parser.add_argument( + "--instance_data_dir", + type=str, + default=None, + required=True, + help="A folder containing the training data of instance images.", + ) + parser.add_argument( + "--class_data_dir", + type=str, + default=None, + required=False, + help="A folder containing the training data of class images.", + ) + parser.add_argument( + "--instance_prompt", + type=str, + default=None, + help="The prompt with identifier specifying the instance", + ) + parser.add_argument( + "--class_prompt", + type=str, + default=None, + help="The prompt to specify images in the same class as provided instance images.", + ) + parser.add_argument( + "--with_prior_preservation", + default=False, + action="store_true", + help="Flag to add prior preservation loss.", + ) + parser.add_argument("--prior_loss_weight", type=float, default=1.0, help="The weight of prior preservation loss.") + parser.add_argument( + "--num_class_images", + type=int, + default=100, + help=( + "Minimal class images for prior preservation loss. If not have enough images, additional images will be" + " sampled with class_prompt." + ), + ) + parser.add_argument( + "--resolution", type=int, default=512, + help=( + "The resolution for input images, all the images in the train/validation dataset will be resized to this" + " resolution" + ), + ) + parser.add_argument( + "--center_crop", action="store_true", default=False, + help="Whether to center crop images before resizing to resolution" + ) + parser.add_argument( + "--sample_batch_size", type=int, default=4, help="Batch size (per device) for sampling images." + ) + return parent_args + + +class DreamBoothDataset(Dataset): + """ + A dataset to prepare the instance and class images with the prompts for fine-tuning the model. + It pre-processes the images and the tokenizes prompts. 
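+    When class_data_dir is given, prior-preservation class images are returned alongside the
+    instance images and the dataset length becomes max(num_class_images, num_instance_images).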
+ """ + + def __init__( + self, + instance_data_dir, + instance_prompt, + tokenizer, + class_data_dir=None, + class_prompt=None, + size=512, + center_crop=False, + ): + self.size = size + self.center_crop = center_crop + self.tokenizer = tokenizer + + self.instance_data_dir = Path(instance_data_dir) + if not self.instance_data_dir.exists(): + raise ValueError("Instance images root doesn't exists.") + + self.instance_images_path = list(Path(instance_data_dir).iterdir()) + print(self.instance_images_path) + self.num_instance_images = len(self.instance_images_path) + self.instance_prompt = instance_prompt + self._length = self.num_instance_images + + if class_data_dir is not None: + self.class_data_dir = Path(class_data_dir) + self.class_data_dir.mkdir(parents=True, exist_ok=True) + self.class_images_path = list(self.class_data_dir.iterdir()) + self.num_class_images = len(self.class_images_path) + self._length = max(self.num_class_images, self.num_instance_images) + self.class_prompt = class_prompt + else: + self.class_data_dir = None + + self.image_transforms = transforms.Compose( + [ + transforms.Resize(size, interpolation=transforms.InterpolationMode.BILINEAR), + transforms.CenterCrop(size) if center_crop else transforms.RandomCrop(size), + transforms.ToTensor(), + transforms.Normalize([0.5], [0.5]), + ] + ) + + def __len__(self): + return self._length + + def __getitem__(self, index): + example = {} + instance_image = Image.open(self.instance_images_path[index % self.num_instance_images]) + if not instance_image.mode == "RGB": + instance_image = instance_image.convert("RGB") + example["instance_images"] = self.image_transforms(instance_image) + example["instance_prompt_ids"] = self.tokenizer( + self.instance_prompt, + padding="do_not_pad", + truncation=True, + max_length=64, + # max_length=self.tokenizer.model_max_length, + ).input_ids + + if self.class_data_dir: + class_image = Image.open(self.class_images_path[index % self.num_class_images]) + if not class_image.mode == "RGB": + class_image = class_image.convert("RGB") + example["class_images"] = self.image_transforms(class_image) + example["class_prompt_ids"] = self.tokenizer( + self.class_prompt, + padding="do_not_pad", + truncation=True, + # max_length=self.tokenizer.model_max_length, + max_length=64, + ).input_ids + + return example + + +class PromptDataset(Dataset): + "A simple dataset to prepare the prompts to generate class images on multiple GPUs." + + def __init__(self, prompt, num_samples): + self.prompt = prompt + self.num_samples = num_samples + + def __len__(self): + return self.num_samples + + def __getitem__(self, index): + example = {} + example["prompt"] = self.prompt + example["index"] = index + return example diff --git a/fengshen/data/hubert/hubert_dataset.py b/fengshen/data/hubert/hubert_dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..d8eaa25a5238740cc86a05af257aa3e0996f1499 --- /dev/null +++ b/fengshen/data/hubert/hubert_dataset.py @@ -0,0 +1,361 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
+ +import itertools +import logging +import os +import sys +from typing import Any, List, Optional, Union + +import numpy as np + +import torch +import torch.nn.functional as F +from fairseq.data import data_utils +from fairseq.data.fairseq_dataset import FairseqDataset + +logger = logging.getLogger(__name__) + + +def add_data_specific_args(parent_args): + parser = parent_args.add_argument_group('Hubert Dataset') + parser.add_argument('--data', type=str) + parser.add_argument('--sample_rate', type=float, default=16000) + parser.add_argument('--label_dir', type=str) + parser.add_argument('--labels', type=str, nargs='+') + parser.add_argument('--label_rate', type=float) + parser.add_argument('--max_keep_size', type=int, default=None) + parser.add_argument('--min_sample_size', type=int) + parser.add_argument('--max_sample_size', type=int) + parser.add_argument('--pad_audio', type=bool) + parser.add_argument('--normalize', type=bool) + parser.add_argument('--random_crop', type=bool) + parser.add_argument('--single_target', type=bool, default=False) + return parent_args + + +def load_audio(manifest_path, max_keep, min_keep): + n_long, n_short = 0, 0 + names, inds, sizes = [], [], [] + with open(manifest_path) as f: + root = f.readline().strip() + for ind, line in enumerate(f): + items = line.strip().split("\t") + assert len(items) == 2, line + sz = int(items[1]) + if min_keep is not None and sz < min_keep: + n_short += 1 + elif max_keep is not None and sz > max_keep: + n_long += 1 + else: + names.append(items[0]) + inds.append(ind) + sizes.append(sz) + tot = ind + 1 + logger.info( + ( + f"max_keep={max_keep}, min_keep={min_keep}, " + f"loaded {len(names)}, skipped {n_short} short and {n_long} long, " + f"longest-loaded={max(sizes)}, shortest-loaded={min(sizes)}" + ) + ) + return root, names, inds, tot, sizes + + +def load_label(label_path, inds, tot): + with open(label_path) as f: + labels = [line.rstrip() for line in f] + assert ( + len(labels) == tot + ), f"number of labels does not match ({len(labels)} != {tot})" + labels = [labels[i] for i in inds] + return labels + + +def load_label_offset(label_path, inds, tot): + with open(label_path) as f: + code_lengths = [len(line.encode("utf-8")) for line in f] + assert ( + len(code_lengths) == tot + ), f"number of labels does not match ({len(code_lengths)} != {tot})" + offsets = list(itertools.accumulate([0] + code_lengths)) + offsets = [(offsets[i], offsets[i + 1]) for i in inds] + return offsets + + +def verify_label_lengths( + audio_sizes, + audio_rate, + label_path, + label_rate, + inds, + tot, + tol=0.1, # tolerance in seconds +): + if label_rate < 0: + logger.info(f"{label_path} is sequence label. skipped") + return + + with open(label_path) as f: + lengths = [len(line.rstrip().split()) for line in f] + assert len(lengths) == tot + lengths = [lengths[i] for i in inds] + num_invalid = 0 + for i, ind in enumerate(inds): + dur_from_audio = audio_sizes[i] / audio_rate + dur_from_label = lengths[i] / label_rate + if abs(dur_from_audio - dur_from_label) > tol: + logger.warning( + ( + f"audio and label duration differ too much " + f"(|{dur_from_audio} - {dur_from_label}| > {tol}) " + f"in line {ind+1} of {label_path}. Check if `label_rate` " + f"is correctly set (currently {label_rate}). " + f"num. 
of samples = {audio_sizes[i]}; " + f"label length = {lengths[i]}" + ) + ) + num_invalid += 1 + if num_invalid > 0: + logger.warning( + f"total {num_invalid} (audio, label) pairs with mismatched lengths" + ) + + +class HubertDataset(FairseqDataset): + def __init__( + self, + manifest_path: str, + sample_rate: float, + label_paths: List[str], + label_rates: Union[List[float], float], # -1 for sequence labels + pad_list: List[str], + eos_list: List[str], + label_processors: Optional[List[Any]] = None, + max_keep_sample_size: Optional[int] = None, + min_keep_sample_size: Optional[int] = None, + max_sample_size: Optional[int] = None, + shuffle: bool = True, + pad_audio: bool = False, + normalize: bool = False, + store_labels: bool = True, + random_crop: bool = False, + single_target: bool = False, + ): + self.audio_root, self.audio_names, inds, tot, self.sizes = load_audio( + manifest_path, max_keep_sample_size, min_keep_sample_size + ) + self.sample_rate = sample_rate + self.shuffle = shuffle + self.random_crop = random_crop + + self.num_labels = len(label_paths) + self.pad_list = pad_list + self.eos_list = eos_list + self.label_processors = label_processors + self.single_target = single_target + self.label_rates = ( + [label_rates for _ in range(len(label_paths))] + if isinstance(label_rates, float) + else label_rates + ) + self.store_labels = store_labels + if store_labels: + self.label_list = [load_label(p, inds, tot) for p in label_paths] + else: + self.label_paths = label_paths + self.label_offsets_list = [ + load_label_offset(p, inds, tot) for p in label_paths + ] + assert label_processors is None or len(label_processors) == self.num_labels + for label_path, label_rate in zip(label_paths, self.label_rates): + verify_label_lengths( + self.sizes, sample_rate, label_path, label_rate, inds, tot + ) + + self.max_sample_size = ( + max_sample_size if max_sample_size is not None else sys.maxsize + ) + self.pad_audio = pad_audio + self.normalize = normalize + logger.info( + f"pad_audio={pad_audio}, random_crop={random_crop}, " + f"normalize={normalize}, max_sample_size={self.max_sample_size}" + ) + + def get_audio(self, index): + import soundfile as sf + + wav_path = os.path.join(self.audio_root, self.audio_names[index]) + wav, cur_sample_rate = sf.read(wav_path) + wav = torch.from_numpy(wav).float() + wav = self.postprocess(wav, cur_sample_rate) + return wav + + def get_label(self, index, label_idx): + if self.store_labels: + label = self.label_list[label_idx][index] + else: + with open(self.label_paths[label_idx]) as f: + offset_s, offset_e = self.label_offsets_list[label_idx][index] + f.seek(offset_s) + label = f.read(offset_e - offset_s) + + if self.label_processors is not None: + label = self.label_processors[label_idx](label) + return label + + def get_labels(self, index): + return [self.get_label(index, i) for i in range(self.num_labels)] + + def __getitem__(self, index): + wav = self.get_audio(index) + labels = self.get_labels(index) + return {"id": index, "source": wav, "label_list": labels} + + def __len__(self): + return len(self.sizes) + + def crop_to_max_size(self, wav, target_size): + size = len(wav) + diff = size - target_size + if diff <= 0: + return wav, 0 + + start, end = 0, target_size + if self.random_crop: + start = np.random.randint(0, diff + 1) + end = size - diff + start + return wav[start:end], start + + def collater(self, samples): + # target = max(sizes) -> random_crop not used + # target = max_sample_size -> random_crop used for long + samples = [s for s in samples if 
s["source"] is not None] + if len(samples) == 0: + return {} + + audios = [s["source"] for s in samples] + audio_sizes = [len(s) for s in audios] + if self.pad_audio: + audio_size = min(max(audio_sizes), self.max_sample_size) + else: + audio_size = min(min(audio_sizes), self.max_sample_size) + collated_audios, padding_mask, audio_starts = self.collater_audio( + audios, audio_size + ) + + targets_by_label = [ + [s["label_list"][i] for s in samples] for i in range(self.num_labels) + ] + targets_list, lengths_list, ntokens_list = self.collater_label( + targets_by_label, audio_size, audio_starts + ) + + net_input = {"source": collated_audios, "padding_mask": padding_mask} + batch = { + "id": torch.LongTensor([s["id"] for s in samples]), + "net_input": net_input, + } + + if self.single_target: + batch["target_lengths"] = lengths_list[0] + batch["ntokens"] = ntokens_list[0] + batch["target"] = targets_list[0] + else: + batch["target_lengths_list"] = lengths_list + batch["ntokens_list"] = ntokens_list + batch["target_list"] = targets_list + return batch + + def collater_audio(self, audios, audio_size): + collated_audios = audios[0].new_zeros(len(audios), audio_size) + padding_mask = ( + torch.BoolTensor(collated_audios.shape).fill_(False) + # if self.pad_audio else None + ) + audio_starts = [0 for _ in audios] + for i, audio in enumerate(audios): + diff = len(audio) - audio_size + if diff == 0: + collated_audios[i] = audio + elif diff < 0: + assert self.pad_audio + collated_audios[i] = torch.cat([audio, audio.new_full((-diff,), 0.0)]) + padding_mask[i, diff:] = True + else: + collated_audios[i], audio_starts[i] = self.crop_to_max_size( + audio, audio_size + ) + return collated_audios, padding_mask, audio_starts + + def collater_frm_label(self, targets, audio_size, audio_starts, label_rate, pad): + assert label_rate > 0 + s2f = label_rate / self.sample_rate + frm_starts = [int(round(s * s2f)) for s in audio_starts] + frm_size = int(round(audio_size * s2f)) + if not self.pad_audio: + rem_size = [len(t) - s for t, s in zip(targets, frm_starts)] + frm_size = min(frm_size, *rem_size) + targets = [t[s: s + frm_size] for t, s in zip(targets, frm_starts)] + logger.debug(f"audio_starts={audio_starts}") + logger.debug(f"frame_starts={frm_starts}") + logger.debug(f"frame_size={frm_size}") + + lengths = torch.LongTensor([len(t) for t in targets]) + ntokens = lengths.sum().item() + targets = data_utils.collate_tokens(targets, pad_idx=pad, left_pad=False) + return targets, lengths, ntokens + + def collater_seq_label(self, targets, pad): + lengths = torch.LongTensor([len(t) for t in targets]) + ntokens = lengths.sum().item() + targets = data_utils.collate_tokens(targets, pad_idx=pad, left_pad=False) + return targets, lengths, ntokens + + def collater_label(self, targets_by_label, audio_size, audio_starts): + targets_list, lengths_list, ntokens_list = [], [], [] + itr = zip(targets_by_label, self.label_rates, self.pad_list) + for targets, label_rate, pad in itr: + if label_rate == -1.0: + targets, lengths, ntokens = self.collater_seq_label(targets, pad) + else: + targets, lengths, ntokens = self.collater_frm_label( + targets, audio_size, audio_starts, label_rate, pad + ) + targets_list.append(targets) + lengths_list.append(lengths) + ntokens_list.append(ntokens) + return targets_list, lengths_list, ntokens_list + + def num_tokens(self, index): + return self.size(index) + + def size(self, index): + if self.pad_audio: + return self.sizes[index] + return min(self.sizes[index], self.max_sample_size) + + def 
ordered_indices(self): + if self.shuffle: + order = [np.random.permutation(len(self))] + else: + order = [np.arange(len(self))] + + order.append(self.sizes) + return np.lexsort(order)[::-1] + + def postprocess(self, wav, cur_sample_rate): + if wav.dim() == 2: + wav = wav.mean(-1) + assert wav.dim() == 1, wav.dim() + + if cur_sample_rate != self.sample_rate: + raise Exception(f"sr {cur_sample_rate} != {self.sample_rate}") + + if self.normalize: + with torch.no_grad(): + wav = F.layer_norm(wav, wav.shape) + return wav diff --git a/fengshen/data/megatron_dataloader/Makefile b/fengshen/data/megatron_dataloader/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..8f9db7686696fbea6c94b998db4b40ef426c748d --- /dev/null +++ b/fengshen/data/megatron_dataloader/Makefile @@ -0,0 +1,9 @@ +CXXFLAGS += -O3 -Wall -shared -std=c++11 -fPIC -fdiagnostics-color +CPPFLAGS += $(shell python3 -m pybind11 --includes) +LIBNAME = helpers +LIBEXT = $(shell python3-config --extension-suffix) + +default: $(LIBNAME)$(LIBEXT) + +%$(LIBEXT): %.cpp + $(CXX) $(CXXFLAGS) $(CPPFLAGS) $< -o $@ diff --git a/fengshen/data/megatron_dataloader/__init__.py b/fengshen/data/megatron_dataloader/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..cd5f898c6bdf89c6cf0243af102d04f6efed86b8 --- /dev/null +++ b/fengshen/data/megatron_dataloader/__init__.py @@ -0,0 +1 @@ +from . import indexed_dataset diff --git a/fengshen/data/megatron_dataloader/bart_dataset.py b/fengshen/data/megatron_dataloader/bart_dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..82a22aa21eba9ac4794305c72efe3c25e2bdefb7 --- /dev/null +++ b/fengshen/data/megatron_dataloader/bart_dataset.py @@ -0,0 +1,443 @@ +"""BART Style dataset. Modified from fairseq.""" + +import numpy as np +import torch +import math +import re + +from fengshen.data.megatron_dataloader.dataset_utils import ( + get_samples_mapping +) + + +class BartDataset(torch.utils.data.Dataset): + def __init__(self, name, indexed_dataset, data_prefix, + num_epochs, max_num_samples, masked_lm_prob, + max_seq_length, short_seq_prob, seed, tokenizer, zh_tokenizer): + + # Params to store. + self.name = name + self.seed = seed + self.masked_lm_prob = masked_lm_prob + self.max_seq_length = max_seq_length + + # Dataset. + self.indexed_dataset = indexed_dataset + + # Build the samples mapping. + self.samples_mapping = get_samples_mapping(self.indexed_dataset, + data_prefix, + num_epochs, + max_num_samples, + self.max_seq_length - 3, # account for added tokens + short_seq_prob, + self.seed, + self.name, + False) + + # Vocab stuff. 
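+        # Cache the vocab size, id<->token maps, and special token ids (CLS/SEP/MASK/PAD) that the denoising transforms below rely on.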
+ self.vocab_size = tokenizer.vocab_size + inv_vocab = {v: k for k, v in tokenizer.vocab.items()} + self.vocab_id_list = list(inv_vocab.keys()) + self.vocab_id_to_token_dict = inv_vocab + self.cls_id = tokenizer.cls_token_id + self.sep_id = tokenizer.sep_token_id + self.mask_id = tokenizer.mask_token_id + self.pad_id = tokenizer.pad_token_id + self.tokenizer = tokenizer + + seg_tokens = ['。', ';', ';', '!', '!', '?', '?'] + seg_token_ids = [] + for t in seg_tokens: + if t in tokenizer.vocab: + seg_token_ids.append(tokenizer.vocab[t]) + else: + print('seg_token "{}" not in vocab'.format(t)) + self.seg_token_ids = set(seg_token_ids) + + self.zh_tokenizer = zh_tokenizer + + # Denoising ratios + self.permute_sentence_ratio = 1.0 + self.mask_ratio = masked_lm_prob # 0.15 + self.random_ratio = 0.1 + self.insert_ratio = 0.0 + self.rotate_ratio = 0.0 + self.mask_whole_word = 1 + self.item_transform_func = None + + self.mask_span_distribution = None + if False: + _lambda = 3 # Poisson lambda + + lambda_to_the_k = 1 + e_to_the_minus_lambda = math.exp(-_lambda) + k_factorial = 1 + ps = [] + for k in range(0, 128): + ps.append(e_to_the_minus_lambda * lambda_to_the_k / k_factorial) + lambda_to_the_k *= _lambda + k_factorial *= k + 1 + if ps[-1] < 0.0000001: + break + ps = torch.FloatTensor(ps) + self.mask_span_distribution = torch.distributions.Categorical(ps) + + def __len__(self): + return self.samples_mapping.shape[0] + + def __getitem__(self, idx): + start_idx, end_idx, seq_length = self.samples_mapping[idx] + sample = [self.indexed_dataset[i] for i in range(start_idx, end_idx)] + # Note that this rng state should be numpy and not python since + # python randint is inclusive whereas the numpy one is exclusive. + # We % 2**32 since numpy requres the seed to be between 0 and 2**32 - 1 + np_rng = np.random.RandomState(seed=((self.seed + idx) % 2**32)) + return self.build_training_sample(sample, self.max_seq_length, np_rng) + + def build_training_sample(self, sample, max_seq_length, np_rng): + """Biuld training sample. + + Arguments: + sample: A list of sentences in which each sentence is a list token ids. + max_seq_length: Desired sequence length. + np_rng: Random number genenrator. Note that this rng state should be + numpy and not python since python randint is inclusive for + the opper bound whereas the numpy one is exclusive. 
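+        Returns:
+            A dict with padded input_ids, labels and attention_mask, produced after sentence permutation and whole-word masking (when the corresponding ratios are positive).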
+ """ + # permute sentences + full_stops = [] + tokens = [self.cls_id] + for sent in sample: + for t in sent: + token = self.vocab_id_to_token_dict[t] + if len(re.findall('##[\u4E00-\u9FA5]', token)) > 0: + # 兼容erlangshen ##的方式做whole word mask + t = self.tokenizer.convert_tokens_to_ids(token[2:]) + tokens.append(t) + if t in self.seg_token_ids: + tokens.append(self.sep_id) + if tokens[-1] != self.sep_id: + tokens.append(self.sep_id) + + if len(tokens) > max_seq_length: + tokens = tokens[:max_seq_length] + tokens[-1] = self.sep_id + tokens = torch.LongTensor(tokens) + full_stops = (tokens == self.sep_id).long() + assert (max_seq_length - tokens.shape[0]) >= 0, (tokens.size(), tokens[-1], max_seq_length) + + source, target = tokens, tokens[1:].clone() + use_decoder = 1 + # if torch.rand(1).item() < 0.5: + # use_decoder = 0 + + if self.permute_sentence_ratio > 0.0 and use_decoder == 1: + source = self.permute_sentences(source, full_stops, self.permute_sentence_ratio) + + if self.mask_ratio > 0.0: + replace_length = 1 if use_decoder else -1 + mask_ratio = self.mask_ratio * 2 if use_decoder else self.mask_ratio + source = self.add_whole_word_mask(source, mask_ratio, replace_length) + + if self.insert_ratio > 0.0: + raise NotImplementedError + source = self.add_insertion_noise(source, self.insert_ratio) + + if self.rotate_ratio > 0.0 and np.random.random() < self.rotate_ratio: + raise NotImplementedError + source = self.add_rolling_noise(source) + + # there can additional changes to make: + if self.item_transform_func is not None: + source, target = self.item_transform_func(source, target) + + assert (source >= 0).all() + # assert (source[1:-1] >= 1).all() + assert (source <= self.vocab_size).all() + assert source[0] == self.cls_id + assert source[-1] == self.sep_id + + # tokenizer = get_tokenizer() + # print(' '.join(tokenizer.tokenizer.convert_ids_to_tokens(source))) + # print(tokenizer.detokenize(target)) + # print(tokenizer.detokenize(source)) + # print() + + prev_output_tokens = torch.zeros_like(target) + prev_output_tokens[0] = self.sep_id # match the preprocessing in fairseq + prev_output_tokens[1:] = target[:-1] + + # src_padding_length = max_seq_length - source.shape[0] + # tgt_padding_length = max_seq_length - target.shape[0] + # assert src_padding_length >= 0, (source.size(), source[-1], max_seq_length) + # assert tgt_padding_length >= 0, (target.size(), target[-1], max_seq_length) + source_ = torch.full((max_seq_length,), self.pad_id, dtype=torch.long) + source_[:source.shape[0]] = source + target_ = torch.full((max_seq_length,), -100, dtype=torch.long) + # decoder not need bos in the front + target_[:target.shape[0]] = target + prev_output_tokens_ = torch.full((max_seq_length,), self.pad_id, dtype=torch.long) + prev_output_tokens_[:prev_output_tokens.shape[0]] = prev_output_tokens + + return { + "input_ids": source_, + "labels": target_, + # "decoder_input_ids": prev_output_tokens_, + "attention_mask": (source_ != self.pad_id).long() + } + + def permute_sentences(self, source, full_stops, p=1.0): + # Tokens that are full stops, where the previous token is not + sentence_ends = (full_stops[1:] * ~full_stops[:-1]).nonzero(as_tuple=False) + 2 + result = source.clone() + + num_sentences = sentence_ends.size(0) + num_to_permute = math.ceil((num_sentences * 2 * p) / 2.0) + substitutions = torch.randperm(num_sentences)[:num_to_permute] + ordering = torch.arange(0, num_sentences) + ordering[substitutions] = substitutions[torch.randperm(num_to_permute)] + + # Ignore at start + index = 1 + 
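+        # Copy each sentence span back into `result` in the permuted order, starting after the leading CLS token.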
for i in ordering: + sentence = source[(sentence_ends[i - 1] if i > 0 else 1): sentence_ends[i]] + result[index: index + sentence.size(0)] = sentence + index += sentence.size(0) + return result + + def word_starts_en(self, source): + if self.mask_whole_word is not None: + is_word_start = self.mask_whole_word.gather(0, source) + else: + is_word_start = torch.ones(source.size()) + is_word_start[0] = 0 + is_word_start[-1] = 0 + return is_word_start + + def word_starts(self, source): + if self.mask_whole_word is None: + is_word_start = torch.ones(source.size()) + is_word_start[0] = 0 + is_word_start[-1] = 0 + return is_word_start + raw_tokens = [self.vocab_id_to_token_dict[i] for i in source.tolist()] + words = [raw_tokens[0]] + \ + self.zh_tokenizer(''.join(raw_tokens[1:-1]), HMM=True) + [raw_tokens[-1]] + + def _is_chinese_char(c): + """Checks whether CP is the #codepoint of a CJK character.""" + # This defines a "chinese character" as anything in the CJK Unicode block: + # https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block) + # + # Note that the CJK Unicode block is NOT all Japanese and Korean characters, + # despite its name. The modern Korean Hangul alphabet is a different block, + # as is Japanese Hiragana and Katakana. Those alphabets are used to write + # space-separated words, so they are not treated specially and handled + # like the all of the other languages. + if len(c) > 1: + return all([_is_chinese_char(c_i) for c_i in c]) + cp = ord(c) + if ((cp >= 0x4E00 and cp <= 0x9FFF) or # + (cp >= 0x3400 and cp <= 0x4DBF) or # + (cp >= 0x20000 and cp <= 0x2A6DF) or # + (cp >= 0x2A700 and cp <= 0x2B73F) or # + (cp >= 0x2B740 and cp <= 0x2B81F) or # + (cp >= 0x2B820 and cp <= 0x2CEAF) or + (cp >= 0xF900 and cp <= 0xFAFF) or # + (cp >= 0x2F800 and cp <= 0x2FA1F)): # + return True + + return False + + def align_linear(atokens, btokens): + a2c = [] + c2b = [] + a2b = [] + length = 0 + for tok in atokens: + a2c.append([length + i for i in range(len(tok))]) + length += len(tok) + for i, tok in enumerate(btokens): + c2b.extend([i for _ in range(len(tok))]) + + for i, amap in enumerate(a2c): + bmap = [c2b[ci] for ci in amap] + a2b.append(list(set(bmap))) + return a2b + + raw_to_word_align = align_linear(raw_tokens, words) + is_word_start = torch.zeros(source.size()) + word_starts = [] + skip_cur_word = True + for i in range(1, len(raw_to_word_align)): + if raw_to_word_align[i-1] == raw_to_word_align[i]: + # not a word start, as they align to the same word + if not skip_cur_word and not _is_chinese_char(raw_tokens[i]): + word_starts.pop(-1) + skip_cur_word = True + continue + else: + is_word_start[i] = 1 + if _is_chinese_char(raw_tokens[i]): + word_starts.append(i) + skip_cur_word = False + is_word_start[0] = 0 + is_word_start[-1] = 0 + word_starts = torch.tensor(word_starts).long().view(-1, 1) + return is_word_start, word_starts + + def add_whole_word_mask(self, source, p, replace_length=1): + is_word_start, word_starts = self.word_starts(source) + num_to_mask_word = int(math.ceil(word_starts.size(0) * p)) + num_to_mask_char = int(math.ceil(word_starts.size(0) * p * 0.1)) + num_to_mask = num_to_mask_word + num_to_mask_char + if num_to_mask > word_starts.size(0): + word_starts = is_word_start.nonzero(as_tuple=False) + num_inserts = 0 + if num_to_mask == 0: + return source + + if self.mask_span_distribution is not None: + lengths = self.mask_span_distribution.sample(sample_shape=(num_to_mask,)) + + # Make sure we have enough to mask + cum_length = torch.cumsum(lengths, 0) + while 
cum_length[-1] < num_to_mask: + lengths = torch.cat( + [ + lengths, + self.mask_span_distribution.sample(sample_shape=(num_to_mask,)), + ], + dim=0, + ) + cum_length = torch.cumsum(lengths, 0) + + # Trim to masking budget + i = 0 + while cum_length[i] < num_to_mask: + i += 1 + lengths[i] = num_to_mask - (0 if i == 0 else cum_length[i - 1]) + num_to_mask = i + 1 + lengths = lengths[:num_to_mask] + + # Handle 0-length mask (inserts) separately + lengths = lengths[lengths > 0] + num_inserts = num_to_mask - lengths.size(0) + num_to_mask -= num_inserts + if num_to_mask == 0: + return self.add_insertion_noise(source, num_inserts / source.size(0)) + + assert (lengths > 0).all() + else: + lengths = torch.ones((num_to_mask,)).long() + assert is_word_start[-1] == 0 + indices = word_starts[ + torch.randperm(word_starts.size(0))[:num_to_mask] + ].squeeze(1) + mask_random = torch.FloatTensor(num_to_mask).uniform_() < self.random_ratio + source_length = source.size(0) + assert source_length - 1 not in indices + to_keep = torch.ones(source_length, dtype=torch.bool) + is_word_start[ + -1 + ] = 255 # acts as a long length, so spans don't go over the end of doc + if replace_length == 0: + to_keep[indices] = 0 + else: + # keep index, but replace it with [MASK] + # print(source.size(), word_starts.size(), indices.size(), mask_random.size()) + source[indices] = self.mask_id + source[indices[mask_random]] = torch.randint( + 1, self.vocab_size, size=(mask_random.sum(),) + ) + # sorted_indices = torch.sort(indices)[0] + # continue_mask_pos = ((sorted_indices + 1)[:-1] == sorted_indices[1:]) + # continue_mask_indices = sorted_indices[1:][continue_mask_pos] + # to_keep[continue_mask_indices] = 0 + + # for char indices, we already masked, the following loop handles word mask + indices = indices[:num_to_mask_word] + mask_random = mask_random[:num_to_mask_word] + if self.mask_span_distribution is not None: + assert len(lengths.size()) == 1 + assert lengths.size() == indices.size() + lengths -= 1 + while indices.size(0) > 0: + assert lengths.size() == indices.size() + lengths -= is_word_start[indices + 1].long() + uncompleted = lengths >= 0 + indices = indices[uncompleted] + 1 + mask_random = mask_random[uncompleted] + lengths = lengths[uncompleted] + if replace_length != -1: + # delete token + to_keep[indices] = 0 + else: + # keep index, but replace it with [MASK] + source[indices] = self.mask_id + source[indices[mask_random]] = torch.randint( + 1, self.vocab_size, size=(mask_random.sum(),) + ) + else: + # A bit faster when all lengths are 1 + while indices.size(0) > 0: + uncompleted = is_word_start[indices + 1] == 0 + indices = indices[uncompleted] + 1 + mask_random = mask_random[uncompleted] + if replace_length != -1: + # delete token + to_keep[indices] = 0 + else: + # keep index, but replace it with [MASK] + source[indices] = self.mask_id + source[indices[mask_random]] = torch.randint( + 1, self.vocab_size, size=(mask_random.sum(),) + ) + + assert source_length - 1 not in indices + + source = source[to_keep] + + if num_inserts > 0: + source = self.add_insertion_noise(source, num_inserts / source.size(0)) + + return source + + def add_permuted_noise(self, tokens, p): + num_words = len(tokens) + num_to_permute = math.ceil(((num_words * 2) * p) / 2.0) + substitutions = torch.randperm(num_words - 2)[:num_to_permute] + 1 + tokens[substitutions] = tokens[substitutions[torch.randperm(num_to_permute)]] + return tokens + + def add_rolling_noise(self, tokens): + offset = np.random.randint(1, max(1, tokens.size(-1) - 1) + 1) 
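+        # Rotate the interior tokens by the random offset while keeping the first and last tokens in place.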
+ tokens = torch.cat( + (tokens[0:1], tokens[offset:-1], tokens[1:offset], tokens[-1:]), + dim=0, + ) + return tokens + + def add_insertion_noise(self, tokens, p): + if p == 0.0: + return tokens + + num_tokens = len(tokens) + n = int(math.ceil(num_tokens * p)) + + noise_indices = torch.randperm(num_tokens + n - 2)[:n] + 1 + noise_mask = torch.zeros(size=(num_tokens + n,), dtype=torch.bool) + noise_mask[noise_indices] = 1 + result = torch.LongTensor(n + len(tokens)).fill_(-1) + + num_random = int(math.ceil(n * self.random_ratio)) + result[noise_indices[num_random:]] = self.mask_id + result[noise_indices[:num_random]] = torch.randint( + low=1, high=self.vocab_size, size=(num_random,) + ) + + result[~noise_mask] = tokens + + assert (result >= 0).all() + return result diff --git a/fengshen/data/megatron_dataloader/bert_dataset.py b/fengshen/data/megatron_dataloader/bert_dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..2c007f060fd07fc9c6302b7f88e191469d599222 --- /dev/null +++ b/fengshen/data/megatron_dataloader/bert_dataset.py @@ -0,0 +1,196 @@ +# coding=utf-8 +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""BERT Style dataset.""" + + +import numpy as np +import torch + +from fengshen.data.megatron_dataloader.dataset_utils import ( + get_samples_mapping, + get_a_and_b_segments, + create_masked_lm_predictions, + create_tokens_and_tokentypes, +) + + +class BertDataset(torch.utils.data.Dataset): + + def __init__(self, name, indexed_dataset, data_prefix, + num_epochs, max_num_samples, masked_lm_prob, + max_seq_length, short_seq_prob, seed, binary_head, tokenizer, masking_style): + # Params to store. + self.name = name + self.seed = seed + self.masked_lm_prob = masked_lm_prob + self.max_seq_length = max_seq_length + self.short_seq_prob = short_seq_prob + self.binary_head = binary_head + self.masking_style = masking_style + + # Dataset. + self.indexed_dataset = indexed_dataset + + # Build the samples mapping. + self.samples_mapping = get_samples_mapping(self.indexed_dataset, + data_prefix, + num_epochs, + max_num_samples, + # account for added tokens + self.max_seq_length - 3, + short_seq_prob, + self.seed, + self.name, + self.binary_head) + inv_vocab = {v: k for k, v in tokenizer.vocab.items()} + self.vocab_id_list = list(inv_vocab.keys()) + self.vocab_id_to_token_dict = inv_vocab + self.cls_id = tokenizer.cls_token_id + self.sep_id = tokenizer.sep_token_id + self.mask_id = tokenizer.mask_token_id + self.pad_id = tokenizer.pad_token_id + self.tokenizer = tokenizer + + def __len__(self): + return self.samples_mapping.shape[0] + + def __getitem__(self, idx): + start_idx, end_idx, seq_length = self.samples_mapping[idx] + sample = [self.indexed_dataset[i] for i in range(start_idx, end_idx)] + # Note that this rng state should be numpy and not python since + # python randint is inclusive whereas the numpy one is exclusive. 
+ # We % 2**32 since numpy requres the seed to be between 0 and 2**32 - 1 + np_rng = np.random.RandomState(seed=((self.seed + idx) % 2**32)) + return build_training_sample(sample, seq_length, + self.max_seq_length, # needed for padding + self.vocab_id_list, + self.vocab_id_to_token_dict, + self.cls_id, self.sep_id, + self.mask_id, self.pad_id, + self.masked_lm_prob, np_rng, + self.binary_head, + tokenizer=self.tokenizer, + masking_style=self.masking_style) + + +def build_training_sample(sample, + target_seq_length, max_seq_length, + vocab_id_list, vocab_id_to_token_dict, + cls_id, sep_id, mask_id, pad_id, + masked_lm_prob, np_rng, binary_head, + tokenizer, + masking_style='bert'): + """Biuld training sample. + + Arguments: + sample: A list of sentences in which each sentence is a list token ids. + target_seq_length: Desired sequence length. + max_seq_length: Maximum length of the sequence. All values are padded to + this length. + vocab_id_list: List of vocabulary ids. Used to pick a random id. + vocab_id_to_token_dict: A dictionary from vocab ids to text tokens. + cls_id: Start of example id. + sep_id: Separator id. + mask_id: Mask token id. + pad_id: Padding token id. + masked_lm_prob: Probability to mask tokens. + np_rng: Random number genenrator. Note that this rng state should be + numpy and not python since python randint is inclusive for + the opper bound whereas the numpy one is exclusive. + """ + + if binary_head: + # We assume that we have at least two sentences in the sample + assert len(sample) > 1 + assert target_seq_length <= max_seq_length + + # Divide sample into two segments (A and B). + if binary_head: + tokens_a, tokens_b, is_next_random = get_a_and_b_segments(sample, + np_rng) + else: + tokens_a = [] + for j in range(len(sample)): + tokens_a.extend(sample[j]) + tokens_b = [] + is_next_random = False + + if len(tokens_a) >= max_seq_length-3: + tokens_a = tokens_a[:max_seq_length-3] + + # Truncate to `target_sequence_length`. + max_num_tokens = target_seq_length + '''' + truncated = truncate_segments(tokens_a, tokens_b, len(tokens_a), + len(tokens_b), max_num_tokens, np_rng) + ''' + + # Build tokens and toketypes. + tokens, tokentypes = create_tokens_and_tokentypes(tokens_a, tokens_b, + cls_id, sep_id) + # Masking. + max_predictions_per_seq = masked_lm_prob * max_num_tokens + (tokens, masked_positions, masked_labels, _, _) = create_masked_lm_predictions( + tokens, vocab_id_list, vocab_id_to_token_dict, masked_lm_prob, + cls_id, sep_id, mask_id, max_predictions_per_seq, np_rng, + tokenizer=tokenizer, + masking_style=masking_style) + + # Padding. + tokens_np, tokentypes_np, labels_np, padding_mask_np, loss_mask_np \ + = pad_and_convert_to_numpy(tokens, tokentypes, masked_positions, + masked_labels, pad_id, max_seq_length) + + train_sample = { + 'input_ids': tokens_np, + 'token_type_ids': tokentypes_np, + 'labels': labels_np, + 'next_sentence_label': int(is_next_random), + 'attention_mask': padding_mask_np} + return train_sample + + +def pad_and_convert_to_numpy(tokens, tokentypes, masked_positions, + masked_labels, pad_id, max_seq_length): + """Pad sequences and convert them to numpy.""" + + # Some checks. + num_tokens = len(tokens) + padding_length = max_seq_length - num_tokens + assert padding_length >= 0 + assert len(tokentypes) == num_tokens + assert len(masked_positions) == len(masked_labels) + + # Tokens and token types. 
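+    # Right-pad tokens and token types with pad_id up to max_seq_length.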
+ filler = [pad_id] * padding_length + tokens_np = np.array(tokens + filler, dtype=np.int64) + tokentypes_np = np.array(tokentypes + filler, dtype=np.int64) + + # Padding mask. + padding_mask_np = np.array([1] * num_tokens + [0] * padding_length, + dtype=np.int64) + + # Lables and loss mask. + labels = [-100] * max_seq_length + loss_mask = [0] * max_seq_length + for i in range(len(masked_positions)): + assert masked_positions[i] < num_tokens + labels[masked_positions[i]] = masked_labels[i] + loss_mask[masked_positions[i]] = 1 + labels_np = np.array(labels, dtype=np.int64) + loss_mask_np = np.array(loss_mask, dtype=np.int64) + + return tokens_np, tokentypes_np, labels_np, padding_mask_np, loss_mask_np diff --git a/fengshen/data/megatron_dataloader/blendable_dataset.py b/fengshen/data/megatron_dataloader/blendable_dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..ee24d4056b86333a13d4926e79283a0bc96bbea3 --- /dev/null +++ b/fengshen/data/megatron_dataloader/blendable_dataset.py @@ -0,0 +1,64 @@ +# coding=utf-8 +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Blendable dataset.""" + +import time + +import numpy as np +import torch + +from fengshen.data.megatron_dataloader.utils import print_rank_0 + + +class BlendableDataset(torch.utils.data.Dataset): + + def __init__(self, datasets, weights): + + self.datasets = datasets + num_datasets = len(datasets) + assert num_datasets == len(weights) + + self.size = 0 + for dataset in self.datasets: + self.size += len(dataset) + + # Normalize weights. + weights = np.array(weights, dtype=np.float64) + sum_weights = np.sum(weights) + assert sum_weights > 0.0 + weights /= sum_weights + + # Build indecies. + start_time = time.time() + assert num_datasets < 255 + self.dataset_index = np.zeros(self.size, dtype=np.uint8) + self.dataset_sample_index = np.zeros(self.size, dtype=np.int64) + + from fengshen.data.megatron_dataloader import helpers + helpers.build_blending_indices(self.dataset_index, + self.dataset_sample_index, + weights, num_datasets, self.size, + torch.distributed.get_rank() == 0) + print_rank_0('> elapsed time for building blendable dataset indices: ' + '{:.2f} (sec)'.format(time.time() - start_time)) + + def __len__(self): + return self.size + + def __getitem__(self, idx): + dataset_idx = self.dataset_index[idx] + sample_idx = self.dataset_sample_index[idx] + return self.datasets[dataset_idx][sample_idx] diff --git a/fengshen/data/megatron_dataloader/dataset_utils.py b/fengshen/data/megatron_dataloader/dataset_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..9b579751573ff8ddf94882c032d4ed6cc168ba07 --- /dev/null +++ b/fengshen/data/megatron_dataloader/dataset_utils.py @@ -0,0 +1,788 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors, and NVIDIA. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +# Most of the code here has been copied from: +# https://github.com/google-research/albert/blob/master/create_pretraining_data.py +# with some modifications. + +import math +import time +import collections + +import numpy as np +import re + +from fengshen.data.megatron_dataloader.utils import ( + print_rank_0 +) +from fengshen.data.megatron_dataloader.blendable_dataset import BlendableDataset +from fengshen.data.megatron_dataloader.indexed_dataset import make_dataset as make_indexed_dataset + +DSET_TYPE_BERT = 'standard_bert' +DSET_TYPE_ICT = 'ict' +DSET_TYPE_T5 = 't5' +DSET_TYPE_BERT_CN_WWM = 'bert_cn_wwm' +DSET_TYPE_BART = 'bart' +DSET_TYPE_COCOLM = 'coco_lm' + +DSET_TYPES = [DSET_TYPE_BERT, DSET_TYPE_ICT, + DSET_TYPE_T5, DSET_TYPE_BERT_CN_WWM, + DSET_TYPE_BART, DSET_TYPE_COCOLM] + + +def get_datasets_weights_and_num_samples(data_prefix, + train_valid_test_num_samples): + + # The data prefix should be in the format of: + # weight-1, data-prefix-1, weight-2, data-prefix-2, .. + assert len(data_prefix) % 2 == 0 + num_datasets = len(data_prefix) // 2 + weights = [0] * num_datasets + prefixes = [0] * num_datasets + for i in range(num_datasets): + weights[i] = float(data_prefix[2 * i]) + prefixes[i] = (data_prefix[2 * i + 1]).strip() + # Normalize weights + weight_sum = 0.0 + for weight in weights: + weight_sum += weight + assert weight_sum > 0.0 + weights = [weight / weight_sum for weight in weights] + + # Add 0.5% (the 1.005 factor) so in case the bleding dataset does + # not uniformly distribute the number of samples, we still have + # samples left to feed to the network. + datasets_train_valid_test_num_samples = [] + for weight in weights: + datasets_train_valid_test_num_samples.append( + [int(math.ceil(val * weight * 1.005)) + for val in train_valid_test_num_samples]) + + return prefixes, weights, datasets_train_valid_test_num_samples + + +def compile_helper(): + """Compile helper function ar runtime. Make sure this + is invoked on a single process.""" + import os + import subprocess + path = os.path.abspath(os.path.dirname(__file__)) + ret = subprocess.run(['make', '-C', path]) + if ret.returncode != 0: + print("Making C++ dataset helpers module failed, exiting.") + import sys + sys.exit(1) + + +def get_a_and_b_segments(sample, np_rng): + """Divide sample into a and b segments.""" + + # Number of sentences in the sample. + n_sentences = len(sample) + # Make sure we always have two sentences. + assert n_sentences > 1, 'make sure each sample has at least two sentences.' + + # First part: + # `a_end` is how many sentences go into the `A`. + a_end = 1 + if n_sentences >= 3: + # Note that randin in numpy is exclusive. 
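+        # Pick a split point so that segment A receives between 1 and n_sentences - 1 sentences.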
+ a_end = np_rng.randint(1, n_sentences) + tokens_a = [] + for j in range(a_end): + tokens_a.extend(sample[j]) + + # Second part: + tokens_b = [] + for j in range(a_end, n_sentences): + tokens_b.extend(sample[j]) + + # Random next: + is_next_random = False + if np_rng.random() < 0.5: + is_next_random = True + tokens_a, tokens_b = tokens_b, tokens_a + + return tokens_a, tokens_b, is_next_random + + +def truncate_segments(tokens_a, tokens_b, len_a, len_b, max_num_tokens, np_rng): + """Truncates a pair of sequences to a maximum sequence length.""" + # print(len_a, len_b, max_num_tokens) + assert len_a > 0 + if len_a + len_b <= max_num_tokens: + return False + while len_a + len_b > max_num_tokens: + if len_a > len_b: + len_a -= 1 + tokens = tokens_a + else: + len_b -= 1 + tokens = tokens_b + if np_rng.random() < 0.5: + del tokens[0] + else: + tokens.pop() + return True + + +def create_tokens_and_tokentypes(tokens_a, tokens_b, cls_id, sep_id): + """Merge segments A and B, add [CLS] and [SEP] and build tokentypes.""" + + tokens = [] + tokentypes = [] + # [CLS]. + tokens.append(cls_id) + tokentypes.append(0) + # Segment A. + for token in tokens_a: + tokens.append(token) + tokentypes.append(0) + # [SEP]. + tokens.append(sep_id) + tokentypes.append(0) + # Segment B. + for token in tokens_b: + tokens.append(token) + tokentypes.append(1) + if tokens_b: + # [SEP]. + tokens.append(sep_id) + tokentypes.append(1) + + return tokens, tokentypes + + +MaskedLmInstance = collections.namedtuple("MaskedLmInstance", + ["index", "label"]) + + +def is_start_piece(piece): + """Check if the current word piece is the starting piece (BERT).""" + # When a word has been split into + # WordPieces, the first token does not have any marker and any subsequence + # tokens are prefixed with ##. So whenever we see the ## token, we + # append it to the previous set of word indexes. + return not piece.startswith("##") + + +def create_masked_lm_predictions(tokens, + vocab_id_list, vocab_id_to_token_dict, + masked_lm_prob, + cls_id, sep_id, mask_id, + max_predictions_per_seq, + np_rng, + tokenizer, + max_ngrams=3, + do_whole_word_mask=True, + favor_longer_ngram=False, + do_permutation=False, + geometric_dist=False, + masking_style="bert", + zh_tokenizer=None): + """Creates the predictions for the masked LM objective. + Note: Tokens here are vocab ids and not text tokens.""" + + cand_indexes = [] + # Note(mingdachen): We create a list for recording if the piece is + # the starting piece of current token, where 1 means true, so that + # on-the-fly whole word masking is possible. + token_boundary = [0] * len(tokens) + + # 如果没有指定中文分词器,那就直接按##算 + if zh_tokenizer is None: + for (i, token) in enumerate(tokens): + if token == cls_id or token == sep_id: + token_boundary[i] = 1 + continue + # Whole Word Masking means that if we mask all of the wordpieces + # corresponding to an original word. + # + # Note that Whole Word Masking does *not* change the training code + # at all -- we still predict each WordPiece independently, softmaxed + # over the entire vocabulary. 
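+            # Continuation pieces (tokens starting with ##) are appended to the previous candidate so a whole word can be masked as one unit.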
+ if (do_whole_word_mask and len(cand_indexes) >= 1 and + not is_start_piece(vocab_id_to_token_dict[token])): + cand_indexes[-1].append(i) + else: + cand_indexes.append([i]) + if is_start_piece(vocab_id_to_token_dict[token]): + token_boundary[i] = 1 + else: + # 如果指定了中文分词器,那就先用分词器分词,然后再进行判断 + # 获取去掉CLS SEP的原始文本 + raw_tokens = [] + for t in tokens: + if t != cls_id and t != sep_id: + raw_tokens.append(t) + raw_tokens = [vocab_id_to_token_dict[i] for i in raw_tokens] + # 分词然后获取每次字开头的最长词的长度 + word_list = set(zh_tokenizer(''.join(raw_tokens), HMM=True)) + word_length_dict = {} + for w in word_list: + if len(w) < 1: + continue + if w[0] not in word_length_dict: + word_length_dict[w[0]] = len(w) + elif word_length_dict[w[0]] < len(w): + word_length_dict[w[0]] = len(w) + i = 0 + # 从词表里面检索 + while i < len(tokens): + token_id = tokens[i] + token = vocab_id_to_token_dict[token_id] + if len(token) == 0 or token_id == cls_id or token_id == sep_id: + token_boundary[i] = 1 + i += 1 + continue + word_max_length = 1 + if token[0] in word_length_dict: + word_max_length = word_length_dict[token[0]] + j = 0 + word = '' + word_end = i+1 + # 兼容以前##的形式,如果后面的词是##开头的,那么直接把后面的拼到前面当作一个词 + old_style = False + while word_end < len(tokens) and vocab_id_to_token_dict[tokens[word_end]].startswith('##'): + old_style = True + word_end += 1 + if not old_style: + while j < word_max_length and i+j < len(tokens): + cur_token = tokens[i+j] + word += vocab_id_to_token_dict[cur_token] + j += 1 + if word in word_list: + word_end = i+j + cand_indexes.append([p for p in range(i, word_end)]) + token_boundary[i] = 1 + i = word_end + + output_tokens = list(tokens) + # add by ganruyi + if masking_style == 'bert-cn-wwm': + # if non chinese is False, that means it is chinese + # then try to remove "##" which is added previously + new_token_ids = [] + for token_id in output_tokens: + token = tokenizer.convert_ids_to_tokens([token_id])[0] + if len(re.findall('##[\u4E00-\u9FA5]', token)) > 0: + token = token[2:] + new_token_id = tokenizer.convert_tokens_to_ids([token])[ + 0] + new_token_ids.append(new_token_id) + output_tokens = new_token_ids + + masked_lm_positions = [] + masked_lm_labels = [] + + if masked_lm_prob == 0: + return (output_tokens, masked_lm_positions, + masked_lm_labels, token_boundary) + + num_to_predict = min(max_predictions_per_seq, + max(1, int(round(len(tokens) * masked_lm_prob)))) + + ngrams = np.arange(1, max_ngrams + 1, dtype=np.int64) + if not geometric_dist: + # Note(mingdachen): + # By default, we set the probilities to favor shorter ngram sequences. + pvals = 1. / np.arange(1, max_ngrams + 1) + pvals /= pvals.sum(keepdims=True) + if favor_longer_ngram: + pvals = pvals[::-1] + # 获取一个ngram的idx,对于每个word,记录他的ngram的word + ngram_indexes = [] + for idx in range(len(cand_indexes)): + ngram_index = [] + for n in ngrams: + ngram_index.append(cand_indexes[idx:idx + n]) + ngram_indexes.append(ngram_index) + + np_rng.shuffle(ngram_indexes) + + (masked_lms, masked_spans) = ([], []) + covered_indexes = set() + for cand_index_set in ngram_indexes: + if len(masked_lms) >= num_to_predict: + break + if not cand_index_set: + continue + # Note(mingdachen): + # Skip current piece if they are covered in lm masking or previous ngrams. 
+ for index_set in cand_index_set[0]: + for index in index_set: + if index in covered_indexes: + continue + + if not geometric_dist: + n = np_rng.choice(ngrams[:len(cand_index_set)], + p=pvals[:len(cand_index_set)] / + pvals[:len(cand_index_set)].sum(keepdims=True)) + else: + # Sampling "n" from the geometric distribution and clipping it to + # the max_ngrams. Using p=0.2 default from the SpanBERT paper + # https://arxiv.org/pdf/1907.10529.pdf (Sec 3.1) + n = min(np_rng.geometric(0.2), max_ngrams) + + index_set = sum(cand_index_set[n - 1], []) + n -= 1 + # Note(mingdachen): + # Repeatedly looking for a candidate that does not exceed the + # maximum number of predictions by trying shorter ngrams. + while len(masked_lms) + len(index_set) > num_to_predict: + if n == 0: + break + index_set = sum(cand_index_set[n - 1], []) + n -= 1 + # If adding a whole-word mask would exceed the maximum number of + # predictions, then just skip this candidate. + if len(masked_lms) + len(index_set) > num_to_predict: + continue + is_any_index_covered = False + for index in index_set: + if index in covered_indexes: + is_any_index_covered = True + break + if is_any_index_covered: + continue + for index in index_set: + covered_indexes.add(index) + masked_token = None + if masking_style == "bert": + # 80% of the time, replace with [MASK] + if np_rng.random() < 0.8: + masked_token = mask_id + else: + # 10% of the time, keep original + if np_rng.random() < 0.5: + masked_token = tokens[index] + # 10% of the time, replace with random word + else: + masked_token = vocab_id_list[np_rng.randint(0, len(vocab_id_list))] + elif masking_style == 'bert-cn-wwm': + # 80% of the time, replace with [MASK] + if np_rng.random() < 0.8: + masked_token = mask_id + else: + # 10% of the time, keep original + if np_rng.random() < 0.5: + # 如果是中文全词mask,去掉tokens里的## + token_id = tokens[index] + token = tokenizer.convert_ids_to_tokens([token_id])[ + 0] + if len(re.findall('##[\u4E00-\u9FA5]', token)) > 0: + token = token[2:] + new_token_id = tokenizer.convert_tokens_to_ids([token])[ + 0] + masked_token = new_token_id + # 10% of the time, replace with random word + else: + masked_token = vocab_id_list[np_rng.randint( + 0, len(vocab_id_list))] + elif masking_style == "t5": + masked_token = mask_id + else: + raise ValueError("invalid value of masking style") + + output_tokens[index] = masked_token + masked_lms.append(MaskedLmInstance( + index=index, label=tokens[index])) + + masked_spans.append(MaskedLmInstance( + index=index_set, + label=[tokens[index] for index in index_set])) + + assert len(masked_lms) <= num_to_predict + np_rng.shuffle(ngram_indexes) + + select_indexes = set() + if do_permutation: + for cand_index_set in ngram_indexes: + if len(select_indexes) >= num_to_predict: + break + if not cand_index_set: + continue + # Note(mingdachen): + # Skip current piece if they are covered in lm masking or previous ngrams. + for index_set in cand_index_set[0]: + for index in index_set: + if index in covered_indexes or index in select_indexes: + continue + + n = np.random.choice(ngrams[:len(cand_index_set)], + p=pvals[:len(cand_index_set)] / + pvals[:len(cand_index_set)].sum(keepdims=True)) + index_set = sum(cand_index_set[n - 1], []) + n -= 1 + + while len(select_indexes) + len(index_set) > num_to_predict: + if n == 0: + break + index_set = sum(cand_index_set[n - 1], []) + n -= 1 + # If adding a whole-word mask would exceed the maximum number of + # predictions, then just skip this candidate. 
+ if len(select_indexes) + len(index_set) > num_to_predict: + continue + is_any_index_covered = False + for index in index_set: + if index in covered_indexes or index in select_indexes: + is_any_index_covered = True + break + if is_any_index_covered: + continue + for index in index_set: + select_indexes.add(index) + assert len(select_indexes) <= num_to_predict + + select_indexes = sorted(select_indexes) + permute_indexes = list(select_indexes) + np_rng.shuffle(permute_indexes) + orig_token = list(output_tokens) + + for src_i, tgt_i in zip(select_indexes, permute_indexes): + output_tokens[src_i] = orig_token[tgt_i] + masked_lms.append(MaskedLmInstance( + index=src_i, label=orig_token[src_i])) + + masked_lms = sorted(masked_lms, key=lambda x: x.index) + # Sort the spans by the index of the first span + masked_spans = sorted(masked_spans, key=lambda x: x.index[0]) + + for p in masked_lms: + masked_lm_positions.append(p.index) + masked_lm_labels.append(p.label) + return (output_tokens, masked_lm_positions, masked_lm_labels, token_boundary, masked_spans) + + +def pad_and_convert_to_numpy(tokens, tokentypes, masked_positions, + masked_labels, pad_id, max_seq_length): + """Pad sequences and convert them to numpy.""" + + # Some checks. + num_tokens = len(tokens) + padding_length = max_seq_length - num_tokens + assert padding_length >= 0 + assert len(tokentypes) == num_tokens + assert len(masked_positions) == len(masked_labels) + + # Tokens and token types. + filler = [pad_id] * padding_length + tokens_np = np.array(tokens + filler, dtype=np.int64) + tokentypes_np = np.array(tokentypes + filler, dtype=np.int64) + + # Padding mask. + padding_mask_np = np.array([1] * num_tokens + [0] * padding_length, + dtype=np.int64) + + # Lables and loss mask. + labels = [-1] * max_seq_length + loss_mask = [0] * max_seq_length + for i in range(len(masked_positions)): + assert masked_positions[i] < num_tokens + labels[masked_positions[i]] = masked_labels[i] + loss_mask[masked_positions[i]] = 1 + labels_np = np.array(labels, dtype=np.int64) + loss_mask_np = np.array(loss_mask, dtype=np.int64) + + return tokens_np, tokentypes_np, labels_np, padding_mask_np, loss_mask_np + + +def build_train_valid_test_datasets(data_prefix, data_impl, splits_string, + train_valid_test_num_samples, + max_seq_length, + masked_lm_prob, short_seq_prob, seed, + tokenizer, + skip_warmup, binary_head=False, + max_seq_length_dec=None, + dataset_type='standard_bert', + zh_tokenizer=None, + span=None): + + if len(data_prefix) == 1: + return _build_train_valid_test_datasets(data_prefix[0], + data_impl, splits_string, + train_valid_test_num_samples, + max_seq_length, masked_lm_prob, + short_seq_prob, seed, + skip_warmup, + binary_head, + max_seq_length_dec, + tokenizer, + dataset_type=dataset_type, + zh_tokenizer=zh_tokenizer, + span=span) + # Blending dataset. + # Parse the values. + output = get_datasets_weights_and_num_samples(data_prefix, + train_valid_test_num_samples) + prefixes, weights, datasets_train_valid_test_num_samples = output + + # Build individual datasets. 
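+    # Build one train/valid/test triple per data prefix; the triples are blended below according to the normalized weights.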
+ train_datasets = [] + valid_datasets = [] + test_datasets = [] + for i in range(len(prefixes)): + train_ds, valid_ds, test_ds = _build_train_valid_test_datasets( + prefixes[i], data_impl, splits_string, + datasets_train_valid_test_num_samples[i], + max_seq_length, masked_lm_prob, short_seq_prob, + seed, skip_warmup, binary_head, max_seq_length_dec, + tokenizer, dataset_type=dataset_type, zh_tokenizer=zh_tokenizer) + if train_ds: + train_datasets.append(train_ds) + if valid_ds: + valid_datasets.append(valid_ds) + if test_ds: + test_datasets.append(test_ds) + + # Blend. + blending_train_dataset = None + if train_datasets: + blending_train_dataset = BlendableDataset(train_datasets, weights) + blending_valid_dataset = None + if valid_datasets: + blending_valid_dataset = BlendableDataset(valid_datasets, weights) + blending_test_dataset = None + if test_datasets: + blending_test_dataset = BlendableDataset(test_datasets, weights) + + return (blending_train_dataset, blending_valid_dataset, + blending_test_dataset) + + +def _build_train_valid_test_datasets(data_prefix, data_impl, splits_string, + train_valid_test_num_samples, + max_seq_length, + masked_lm_prob, short_seq_prob, seed, + skip_warmup, binary_head, + max_seq_length_dec, + tokenizer, + dataset_type='standard_bert', + zh_tokenizer=None, + span=None): + + if dataset_type not in DSET_TYPES: + raise ValueError("Invalid dataset_type: ", dataset_type) + + # Indexed dataset. + indexed_dataset = get_indexed_dataset_(data_prefix, + data_impl, + skip_warmup) + + # Get start and end indices of train/valid/train into doc-idx + # Note that doc-idx is desinged to be num-docs + 1 so we can + # easily iterate over it. + total_num_of_documents = indexed_dataset.doc_idx.shape[0] - 1 + splits = get_train_valid_test_split_(splits_string, total_num_of_documents) + + # Print stats about the splits. + print_rank_0(' > dataset split:') + + def print_split_stats(name, index): + print_rank_0(' {}:'.format(name)) + print_rank_0(' document indices in [{}, {}) total of {} ' + 'documents'.format(splits[index], splits[index + 1], + splits[index + 1] - splits[index])) + start_index = indexed_dataset.doc_idx[splits[index]] + end_index = indexed_dataset.doc_idx[splits[index + 1]] + print_rank_0(' sentence indices in [{}, {}) total of {} ' + 'sentences'.format(start_index, end_index, + end_index - start_index)) + print_split_stats('train', 0) + print_split_stats('validation', 1) + print_split_stats('test', 2) + + def build_dataset(index, name): + from fengshen.data.megatron_dataloader.bert_dataset import BertDataset + from fengshen.data.megatron_dataloader.bart_dataset import BartDataset + from fengshen.data.megatron_dataloader.cocolm_dataset import COCOLMDataset + dataset = None + if splits[index + 1] > splits[index]: + # Get the pointer to the original doc-idx so we can set it later. + doc_idx_ptr = indexed_dataset.get_doc_idx() + # Slice the doc-idx + start_index = splits[index] + # Add +1 so we can index into the dataset to get the upper bound. + end_index = splits[index + 1] + 1 + # New doc_idx view. + indexed_dataset.set_doc_idx(doc_idx_ptr[start_index:end_index]) + # Build the dataset accordingly. 
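+            # Shared constructor arguments; the concrete dataset class is selected by dataset_type below.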
+ kwargs = dict( + name=name, + data_prefix=data_prefix, + num_epochs=None, + max_num_samples=train_valid_test_num_samples[index], + max_seq_length=max_seq_length, + seed=seed, + ) + + if dataset_type == DSET_TYPE_BERT or dataset_type == DSET_TYPE_BERT_CN_WWM: + dataset = BertDataset( + indexed_dataset=indexed_dataset, + masked_lm_prob=masked_lm_prob, + short_seq_prob=short_seq_prob, + binary_head=binary_head, + # 增加参数区分bert和bert-cn-wwm + tokenizer=tokenizer, + masking_style='bert' if dataset_type == DSET_TYPE_BERT else 'bert-cn-wwm', + **kwargs + ) + elif dataset_type == DSET_TYPE_BART: + dataset = BartDataset( + indexed_dataset=indexed_dataset, + masked_lm_prob=masked_lm_prob, + short_seq_prob=short_seq_prob, + tokenizer=tokenizer, + zh_tokenizer=zh_tokenizer, + **kwargs + ) + elif dataset_type == DSET_TYPE_COCOLM: + dataset = COCOLMDataset( + indexed_dataset=indexed_dataset, + masked_lm_prob=masked_lm_prob, + short_seq_prob=short_seq_prob, + tokenizer=tokenizer, + masking_style='bert', + span=span, + **kwargs + ) + else: + raise NotImplementedError( + "Dataset type not fully implemented.") + + # Set the original pointer so dataset remains the main dataset. + indexed_dataset.set_doc_idx(doc_idx_ptr) + # Checks. + assert indexed_dataset.doc_idx[0] == 0 + assert indexed_dataset.doc_idx.shape[0] == \ + (total_num_of_documents + 1) + return dataset + + train_dataset = build_dataset(0, 'train') + valid_dataset = build_dataset(1, 'valid') + test_dataset = build_dataset(2, 'test') + + return (train_dataset, valid_dataset, test_dataset) + + +def get_indexed_dataset_(data_prefix, data_impl, skip_warmup): + + print_rank_0(' > building dataset index ...') + + start_time = time.time() + indexed_dataset = make_indexed_dataset(data_prefix, + data_impl, + skip_warmup) + assert indexed_dataset.sizes.shape[0] == indexed_dataset.doc_idx[-1] + print_rank_0(' > finished creating indexed dataset in {:4f} ' + 'seconds'.format(time.time() - start_time)) + + print_rank_0(' > indexed dataset stats:') + print_rank_0(' number of documents: {}'.format( + indexed_dataset.doc_idx.shape[0] - 1)) + print_rank_0(' number of sentences: {}'.format( + indexed_dataset.sizes.shape[0])) + + return indexed_dataset + + +def get_train_valid_test_split_(splits_string, size): + """ Get dataset splits from comma or '/' separated string list.""" + + splits = [] + if splits_string.find(',') != -1: + splits = [float(s) for s in splits_string.split(',')] + elif splits_string.find('/') != -1: + splits = [float(s) for s in splits_string.split('/')] + else: + splits = [float(splits_string)] + while len(splits) < 3: + splits.append(0.) 
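+    # Keep exactly three ratios, normalize them, and convert them into cumulative document-index boundaries that end at `size`.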
+ splits = splits[:3] + splits_sum = sum(splits) + assert splits_sum > 0.0 + splits = [split / splits_sum for split in splits] + splits_index = [0] + for index, split in enumerate(splits): + splits_index.append(splits_index[index] + + int(round(split * float(size)))) + diff = splits_index[-1] - size + for index in range(1, len(splits_index)): + splits_index[index] -= diff + assert len(splits_index) == 4 + assert splits_index[-1] == size + return splits_index + + +def get_samples_mapping(indexed_dataset, + data_prefix, + num_epochs, + max_num_samples, + max_seq_length, + short_seq_prob, + seed, + name, + binary_head): + """Get a list that maps a sample index to a starting + sentence index, end sentence index, and length""" + + if not num_epochs: + if not max_num_samples: + raise ValueError("Need to specify either max_num_samples " + "or num_epochs") + num_epochs = np.iinfo(np.int32).max - 1 + if not max_num_samples: + max_num_samples = np.iinfo(np.int64).max - 1 + + # Filename of the index mapping + indexmap_filename = data_prefix + indexmap_filename += '_{}_indexmap'.format(name) + if num_epochs != (np.iinfo(np.int32).max - 1): + indexmap_filename += '_{}ep'.format(num_epochs) + if max_num_samples != (np.iinfo(np.int64).max - 1): + indexmap_filename += '_{}mns'.format(max_num_samples) + indexmap_filename += '_{}msl'.format(max_seq_length) + indexmap_filename += '_{:0.2f}ssp'.format(short_seq_prob) + indexmap_filename += '_{}s'.format(seed) + indexmap_filename += '.npy' + + # This should be a barrier but nccl barrier assumes + # device_index=rank which is not the case for model + # parallel case + # ganruyi comment + # counts = torch.cuda.LongTensor([1]) + # torch.distributed.all_reduce( + # counts, group=mpu.get_data_parallel_group()) + # torch.distributed.all_reduce( + # counts, group=mpu.get_pipeline_model_parallel_group()) + # assert counts[0].item() == ( + # torch.distributed.get_world_size() // + # torch.distributed.get_world_size( + # group=mpu.get_tensor_model_parallel_group())) + + # Load indexed dataset. + print_rank_0(' > loading indexed mapping from {}'.format( + indexmap_filename)) + start_time = time.time() + samples_mapping = np.load( + indexmap_filename, allow_pickle=True, mmap_mode='r') + print_rank_0(' loaded indexed file in {:3.3f} seconds'.format( + time.time() - start_time)) + print_rank_0(' total number of samples: {}'.format( + samples_mapping.shape[0])) + + return samples_mapping diff --git a/fengshen/data/megatron_dataloader/helpers.cpp b/fengshen/data/megatron_dataloader/helpers.cpp new file mode 100644 index 0000000000000000000000000000000000000000..31277dd1ce3a449bf962ba5a4d6343e7a9c0b5f9 --- /dev/null +++ b/fengshen/data/megatron_dataloader/helpers.cpp @@ -0,0 +1,794 @@ +/* + coding=utf-8 + Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+ */ + +/* Helper methods for fast index mapping builds */ + +#include +#include +#include +#include +#include +#include +#include +#include + +namespace py = pybind11; +using namespace std; + +const int32_t LONG_SENTENCE_LEN = 512; + +void build_blending_indices(py::array_t &dataset_index, + py::array_t &dataset_sample_index, + const py::array_t &weights, + const int32_t num_datasets, + const int64_t size, const bool verbose) +{ + /* Given multiple datasets and a weighting array, build samples + such that it follows those wieghts.*/ + + if (verbose) + { + std::cout << "> building indices for blendable datasets ..." << std::endl; + } + + // Get the pointer access without the checks. + auto dataset_index_ptr = dataset_index.mutable_unchecked<1>(); + auto dataset_sample_index_ptr = dataset_sample_index.mutable_unchecked<1>(); + auto weights_ptr = weights.unchecked<1>(); + + // Initialize buffer for number of samples used for each dataset. + int64_t current_samples[num_datasets]; + for (int64_t i = 0; i < num_datasets; ++i) + { + current_samples[i] = 0; + } + + // For each sample: + for (int64_t sample_idx = 0; sample_idx < size; ++sample_idx) + { + + // Determine where the max error in sampling is happening. + auto sample_idx_double = std::max(static_cast(sample_idx), 1.0); + int64_t max_error_index = 0; + double max_error = weights_ptr[0] * sample_idx_double - + static_cast(current_samples[0]); + for (int64_t dataset_idx = 1; dataset_idx < num_datasets; ++dataset_idx) + { + double error = weights_ptr[dataset_idx] * sample_idx_double - + static_cast(current_samples[dataset_idx]); + if (error > max_error) + { + max_error = error; + max_error_index = dataset_idx; + } + } + + // Populate the indices. + dataset_index_ptr[sample_idx] = static_cast(max_error_index); + dataset_sample_index_ptr[sample_idx] = current_samples[max_error_index]; + + // Update the total samples. + current_samples[max_error_index] += 1; + } + + // print info + if (verbose) + { + std::cout << " > sample ratios:" << std::endl; + for (int64_t dataset_idx = 0; dataset_idx < num_datasets; ++dataset_idx) + { + auto ratio = static_cast(current_samples[dataset_idx]) / + static_cast(size); + std::cout << " dataset " << dataset_idx << ", input: " << weights_ptr[dataset_idx] << ", achieved: " << ratio << std::endl; + } + } +} + +py::array build_sample_idx(const py::array_t &sizes_, + const py::array_t &doc_idx_, + const int32_t seq_length, + const int32_t num_epochs, + const int64_t tokens_per_epoch) +{ + /* Sample index (sample_idx) is used for gpt2 like dataset for which + the documents are flattened and the samples are built based on this + 1-D flatten array. It is a 2D array with sizes [number-of-samples + 1, 2] + where [..., 0] contains the index into `doc_idx` and [..., 1] is the + starting offset in that document.*/ + + // Consistency checks. + assert(seq_length > 1); + assert(num_epochs > 0); + assert(tokens_per_epoch > 1); + + // Remove bound checks. + auto sizes = sizes_.unchecked<1>(); + auto doc_idx = doc_idx_.unchecked<1>(); + + // Mapping and it's length (1D). 
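+    // sample_idx is a flattened [num_samples + 1, 2] array: column 0 indexes into doc_idx, column 1 is the token offset inside that document.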
+ int64_t num_samples = (num_epochs * tokens_per_epoch - 1) / seq_length; + int32_t *sample_idx = new int32_t[2 * (num_samples + 1)]; + + cout << " using:" << endl + << std::flush; + cout << " number of documents: " << doc_idx_.shape(0) / num_epochs << endl + << std::flush; + cout << " number of epochs: " << num_epochs << endl + << std::flush; + cout << " sequence length: " << seq_length << endl + << std::flush; + cout << " total number of samples: " << num_samples << endl + << std::flush; + + // Index into sample_idx. + int64_t sample_index = 0; + // Index into doc_idx. + int64_t doc_idx_index = 0; + // Begining offset for each document. + int32_t doc_offset = 0; + // Start with first document and no offset. + sample_idx[2 * sample_index] = doc_idx_index; + sample_idx[2 * sample_index + 1] = doc_offset; + ++sample_index; + + while (sample_index <= num_samples) + { + // Start with a fresh sequence. + int32_t remaining_seq_length = seq_length + 1; + while (remaining_seq_length != 0) + { + // Get the document length. + auto doc_id = doc_idx[doc_idx_index]; + auto doc_length = sizes[doc_id] - doc_offset; + // And add it to the current sequence. + remaining_seq_length -= doc_length; + // If we have more than a full sequence, adjust offset and set + // remaining length to zero so we return from the while loop. + // Note that -1 here is for the same reason we have -1 in + // `_num_epochs` calculations. + if (remaining_seq_length <= 0) + { + doc_offset += (remaining_seq_length + doc_length - 1); + remaining_seq_length = 0; + } + else + { + // Otherwise, start from the begining of the next document. + ++doc_idx_index; + doc_offset = 0; + } + } + // Record the sequence. + sample_idx[2 * sample_index] = doc_idx_index; + sample_idx[2 * sample_index + 1] = doc_offset; + ++sample_index; + } + + // Method to deallocate memory. + py::capsule free_when_done(sample_idx, [](void *mem_) + { + int32_t *mem = reinterpret_cast(mem_); + delete[] mem; + }); + + // Return the numpy array. + const auto byte_size = sizeof(int32_t); + return py::array(std::vector{num_samples + 1, 2}, // shape + {2 * byte_size, byte_size}, // C-style contiguous strides + sample_idx, // the data pointer + free_when_done); // numpy array references +} + +inline int32_t get_target_sample_len(const int32_t short_seq_ratio, + const int32_t max_length, + std::mt19937 &rand32_gen) +{ + /* Training sample length. */ + if (short_seq_ratio == 0) + { + return max_length; + } + const auto random_number = rand32_gen(); + if ((random_number % short_seq_ratio) == 0) + { + return 2 + random_number % (max_length - 1); + } + return max_length; +} + +template +py::array build_mapping_impl(const py::array_t &docs_, + const py::array_t &sizes_, + const int32_t num_epochs, + const uint64_t max_num_samples, + const int32_t max_seq_length, + const double short_seq_prob, + const int32_t seed, + const bool verbose, + const int32_t min_num_sent) +{ + /* Build a mapping of (start-index, end-index, sequence-length) where + start and end index are the indices of the sentences in the sample + and sequence-length is the target sequence length. + */ + + // Consistency checks. + assert(num_epochs > 0); + assert(max_seq_length > 1); + assert(short_seq_prob >= 0.0); + assert(short_seq_prob <= 1.0); + assert(seed > 0); + + // Remove bound checks. + auto docs = docs_.unchecked<1>(); + auto sizes = sizes_.unchecked<1>(); + + // For efficiency, convert probability to ratio. Note: rand() generates int. 
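The offset bookkeeping in build_sample_idx above is the part that is easiest to misread; here is a line-for-line Python sketch of the same walk over the flattened documents (illustrative only, the dataloader calls the compiled helper):

import numpy as np

def build_sample_idx_sketch(sizes, doc_idx, seq_length, num_epochs, tokens_per_epoch):
    # sample_idx[i] = (index into doc_idx, token offset) where sample i starts
    num_samples = (num_epochs * tokens_per_epoch - 1) // seq_length
    sample_idx = np.zeros((num_samples + 1, 2), dtype=np.int32)
    doc_pos, offset = 0, 0
    sample_idx[0] = (doc_pos, offset)
    for s in range(1, num_samples + 1):
        remaining = seq_length + 1  # each sample consumes seq_length + 1 tokens, as in the C++ loop
        while remaining > 0:
            doc_len = sizes[doc_idx[doc_pos]] - offset
            remaining -= doc_len
            if remaining <= 0:
                # the sample ends inside this document; advance the offset
                offset += remaining + doc_len - 1
                remaining = 0
            else:
                # the sample spills over into the next document
                doc_pos += 1
                offset = 0
        sample_idx[s] = (doc_pos, offset)
    return sample_idx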
+ int32_t short_seq_ratio = 0; + if (short_seq_prob > 0) + { + short_seq_ratio = static_cast(round(1.0 / short_seq_prob)); + } + + if (verbose) + { + const auto sent_start_index = docs[0]; + const auto sent_end_index = docs[docs_.shape(0) - 1]; + const auto num_sentences = sent_end_index - sent_start_index; + cout << " using:" << endl + << std::flush; + cout << " number of documents: " << docs_.shape(0) - 1 << endl + << std::flush; + cout << " sentences range: [" << sent_start_index << ", " << sent_end_index << ")" << endl + << std::flush; + cout << " total number of sentences: " << num_sentences << endl + << std::flush; + cout << " number of epochs: " << num_epochs << endl + << std::flush; + cout << " maximum number of samples: " << max_num_samples << endl + << std::flush; + cout << " maximum sequence length: " << max_seq_length << endl + << std::flush; + cout << " short sequence probability: " << short_seq_prob << endl + << std::flush; + cout << " short sequence ration (1/prob): " << short_seq_ratio << endl + << std::flush; + cout << " seed: " << seed << endl + << std::flush; + } + + // Mapping and it's length (1D). + int64_t num_samples = -1; + DocIdx *maps = NULL; + + // Perform two iterations, in the first iteration get the size + // and allocate memory and in the second iteration populate the map. + bool second = false; + for (int32_t iteration = 0; iteration < 2; ++iteration) + { + + // Set the seed so both iterations produce the same results. + std::mt19937 rand32_gen(seed); + + // Set the flag on second iteration. + second = (iteration == 1); + + // Counters: + uint64_t empty_docs = 0; + uint64_t one_sent_docs = 0; + uint64_t long_sent_docs = 0; + + // Current map index. + uint64_t map_index = 0; + + // For each epoch: + for (int32_t epoch = 0; epoch < num_epochs; ++epoch) + { + if (map_index >= max_num_samples) + { + if (verbose && (!second)) + { + cout << " reached " << max_num_samples << " samples after " + << epoch << " epochs ..." << endl + << std::flush; + } + break; + } + // For each document: + for (int32_t doc = 0; doc < (docs.shape(0) - 1); ++doc) + { + + // Document sentences are in [sent_index_first, sent_index_last) + const auto sent_index_first = docs[doc]; + const auto sent_index_last = docs[doc + 1]; + + // At the begining of the document previous index is the + // start index. + auto prev_start_index = sent_index_first; + + // Remaining documents. + auto num_remain_sent = sent_index_last - sent_index_first; + + // Some bookkeeping + if ((epoch == 0) && (!second)) + { + if (num_remain_sent == 0) + { + ++empty_docs; + } + if (num_remain_sent == 1) + { + ++one_sent_docs; + } + } + + // Detect documents with long sentences. + bool contains_long_sentence = false; + if (num_remain_sent > 1) + { + for (auto sent_index = sent_index_first; + sent_index < sent_index_last; ++sent_index) + { + if (sizes[sent_index] > LONG_SENTENCE_LEN) + { + if ((epoch == 0) && (!second)) + { + ++long_sent_docs; + } + contains_long_sentence = true; + break; + } + } + } + + // If we have more than two sentences. + if ((num_remain_sent >= min_num_sent) && (!contains_long_sentence)) + { + + // Set values. + auto seq_len = int32_t{0}; + auto num_sent = int32_t{0}; + auto target_seq_len = get_target_sample_len(short_seq_ratio, + max_seq_length, + rand32_gen); + + // Loop through sentences. + for (auto sent_index = sent_index_first; + sent_index < sent_index_last; ++sent_index) + { + + // Add the size and number of sentences. 
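A note on get_target_sample_len above: the probability of drawing a shortened sample is approximately short_seq_prob, and the same 32-bit draw decides both whether to shorten and by how much. A hedged Python restatement of that helper:

import random

def target_sample_len_sketch(short_seq_prob, max_length, rng=random):
    # with probability ~short_seq_prob return a length in [2, max_length],
    # otherwise return max_length (mirrors the C++ helper above)
    if short_seq_prob <= 0:
        return max_length
    short_seq_ratio = int(round(1.0 / short_seq_prob))
    draw = rng.getrandbits(32)
    if draw % short_seq_ratio == 0:
        return 2 + draw % (max_length - 1)
    return max_length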
+ seq_len += sizes[sent_index]; + ++num_sent; + --num_remain_sent; + + // If we have reached the target length. + // and if not only one sentence is left in the document. + // and if we have at least two sentneces. + // and if we have reached end of the document. + if (((seq_len >= target_seq_len) && + (num_remain_sent > 1) && + (num_sent >= min_num_sent)) || + (num_remain_sent == 0)) + { + + // Check for overflow. + if ((3 * map_index + 2) > + std::numeric_limits::max()) + { + cout << "number of samples exceeded maximum " + << "allowed by type int64: " + << std::numeric_limits::max() + << endl; + throw std::overflow_error("Number of samples"); + } + + // Populate the map. + if (second) + { + const auto map_index_0 = 3 * map_index; + maps[map_index_0] = static_cast(prev_start_index); + maps[map_index_0 + 1] = static_cast(sent_index + 1); + maps[map_index_0 + 2] = static_cast(target_seq_len); + } + + // Update indices / counters. + ++map_index; + prev_start_index = sent_index + 1; + target_seq_len = get_target_sample_len(short_seq_ratio, + max_seq_length, + rand32_gen); + seq_len = 0; + num_sent = 0; + } + + } // for (auto sent_index=sent_index_first; ... + } // if (num_remain_sent > 1) { + } // for (int doc=0; doc < num_docs; ++doc) { + } // for (int epoch=0; epoch < num_epochs; ++epoch) { + + if (!second) + { + if (verbose) + { + cout << " number of empty documents: " << empty_docs << endl + << std::flush; + cout << " number of documents with one sentence: " << one_sent_docs << endl + << std::flush; + cout << " number of documents with long sentences: " << long_sent_docs << endl + << std::flush; + cout << " will create mapping for " << map_index << " samples" << endl + << std::flush; + } + assert(maps == NULL); + assert(num_samples < 0); + maps = new DocIdx[3 * map_index]; + num_samples = static_cast(map_index); + } + + } // for (int iteration=0; iteration < 2; ++iteration) { + + // Shuffle. + // We need a 64 bit random number generator as we might have more + // than 2 billion samples. + std::mt19937_64 rand64_gen(seed + 1); + for (auto i = (num_samples - 1); i > 0; --i) + { + const auto j = static_cast(rand64_gen() % (i + 1)); + const auto i0 = 3 * i; + const auto j0 = 3 * j; + // Swap values. + swap(maps[i0], maps[j0]); + swap(maps[i0 + 1], maps[j0 + 1]); + swap(maps[i0 + 2], maps[j0 + 2]); + } + + // Method to deallocate memory. + py::capsule free_when_done(maps, [](void *mem_) + { + DocIdx *mem = reinterpret_cast(mem_); + delete[] mem; + }); + + // Return the numpy array. + const auto byte_size = sizeof(DocIdx); + return py::array(std::vector{num_samples, 3}, // shape + {3 * byte_size, byte_size}, // C-style contiguous strides + maps, // the data pointer + free_when_done); // numpy array references +} + +py::array build_mapping(const py::array_t &docs_, + const py::array_t &sizes_, + const int num_epochs, + const uint64_t max_num_samples, + const int max_seq_length, + const double short_seq_prob, + const int seed, + const bool verbose, + const int32_t min_num_sent) +{ + + if (sizes_.size() > std::numeric_limits::max()) + { + if (verbose) + { + cout << " using uint64 for data mapping..." << endl + << std::flush; + } + return build_mapping_impl(docs_, sizes_, num_epochs, + max_num_samples, max_seq_length, + short_seq_prob, seed, verbose, + min_num_sent); + } + else + { + if (verbose) + { + cout << " using uint32 for data mapping..." 
<< endl + << std::flush; + } + return build_mapping_impl(docs_, sizes_, num_epochs, + max_num_samples, max_seq_length, + short_seq_prob, seed, verbose, + min_num_sent); + } +} + +template +py::array build_blocks_mapping_impl(const py::array_t &docs_, + const py::array_t &sizes_, + const py::array_t &titles_sizes_, + const int32_t num_epochs, + const uint64_t max_num_samples, + const int32_t max_seq_length, + const int32_t seed, + const bool verbose, + const bool use_one_sent_blocks) +{ + /* Build a mapping of (start-index, end-index, sequence-length) where + start and end index are the indices of the sentences in the sample + and sequence-length is the target sequence length. + */ + + // Consistency checks. + assert(num_epochs > 0); + assert(max_seq_length > 1); + assert(seed > 0); + + // Remove bound checks. + auto docs = docs_.unchecked<1>(); + auto sizes = sizes_.unchecked<1>(); + auto titles_sizes = titles_sizes_.unchecked<1>(); + + if (verbose) + { + const auto sent_start_index = docs[0]; + const auto sent_end_index = docs[docs_.shape(0) - 1]; + const auto num_sentences = sent_end_index - sent_start_index; + cout << " using:" << endl + << std::flush; + cout << " number of documents: " << docs_.shape(0) - 1 << endl + << std::flush; + cout << " sentences range: [" << sent_start_index << ", " << sent_end_index << ")" << endl + << std::flush; + cout << " total number of sentences: " << num_sentences << endl + << std::flush; + cout << " number of epochs: " << num_epochs << endl + << std::flush; + cout << " maximum number of samples: " << max_num_samples << endl + << std::flush; + cout << " maximum sequence length: " << max_seq_length << endl + << std::flush; + cout << " seed: " << seed << endl + << std::flush; + } + + // Mapping and its length (1D). + int64_t num_samples = -1; + DocIdx *maps = NULL; + + // Acceptable number of sentences per block. + int min_num_sent = 2; + if (use_one_sent_blocks) + { + min_num_sent = 1; + } + + // Perform two iterations, in the first iteration get the size + // and allocate memory and in the second iteration populate the map. + bool second = false; + for (int32_t iteration = 0; iteration < 2; ++iteration) + { + + // Set the flag on second iteration. + second = (iteration == 1); + + // Current map index. + uint64_t map_index = 0; + + uint64_t empty_docs = 0; + uint64_t one_sent_docs = 0; + uint64_t long_sent_docs = 0; + // For each epoch: + for (int32_t epoch = 0; epoch < num_epochs; ++epoch) + { + // assign every block a unique id + int32_t block_id = 0; + + if (map_index >= max_num_samples) + { + if (verbose && (!second)) + { + cout << " reached " << max_num_samples << " samples after " + << epoch << " epochs ..." << endl + << std::flush; + } + break; + } + // For each document: + for (int32_t doc = 0; doc < (docs.shape(0) - 1); ++doc) + { + + // Document sentences are in [sent_index_first, sent_index_last) + const auto sent_index_first = docs[doc]; + const auto sent_index_last = docs[doc + 1]; + const auto target_seq_len = max_seq_length - titles_sizes[doc]; + + // At the begining of the document previous index is the + // start index. + auto prev_start_index = sent_index_first; + + // Remaining documents. + auto num_remain_sent = sent_index_last - sent_index_first; + + // Some bookkeeping + if ((epoch == 0) && (!second)) + { + if (num_remain_sent == 0) + { + ++empty_docs; + } + if (num_remain_sent == 1) + { + ++one_sent_docs; + } + } + // Detect documents with long sentences. 
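Both mapping builders share the same two-pass structure: the first iteration only counts samples so the output buffer can be sized exactly, the second fills it in, and in build_mapping_impl the 32-bit RNG is re-seeded at the top of each iteration so both passes see identical random draws. A generic sketch of the pattern, assuming the row generator is deterministic for a fixed seed:

import numpy as np

def two_pass_build(make_rows, row_width, seed):
    # pass 1: count rows; pass 2: materialize them into an exactly-sized buffer
    count = sum(1 for _ in make_rows(seed))
    out = np.empty((count, row_width), dtype=np.int64)
    for i, row in enumerate(make_rows(seed)):
        out[i] = row
    return out

This trades a second pass over the data for never having to grow or over-allocate the mapping array.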
+ bool contains_long_sentence = false; + if (num_remain_sent >= min_num_sent) + { + for (auto sent_index = sent_index_first; + sent_index < sent_index_last; ++sent_index) + { + if (sizes[sent_index] > LONG_SENTENCE_LEN) + { + if ((epoch == 0) && (!second)) + { + ++long_sent_docs; + } + contains_long_sentence = true; + break; + } + } + } + // If we have enough sentences and no long sentences. + if ((num_remain_sent >= min_num_sent) && (!contains_long_sentence)) + { + + // Set values. + auto seq_len = int32_t{0}; + auto num_sent = int32_t{0}; + + // Loop through sentences. + for (auto sent_index = sent_index_first; + sent_index < sent_index_last; ++sent_index) + { + + // Add the size and number of sentences. + seq_len += sizes[sent_index]; + ++num_sent; + --num_remain_sent; + + // If we have reached the target length. + // and there are an acceptable number of sentences left + // and if we have at least the minimum number of sentences. + // or if we have reached end of the document. + if (((seq_len >= target_seq_len) && + (num_remain_sent >= min_num_sent) && + (num_sent >= min_num_sent)) || + (num_remain_sent == 0)) + { + + // Populate the map. + if (second) + { + const auto map_index_0 = 4 * map_index; + // Each sample has 4 items: the starting sentence index, ending sentence index, + // the index of the document from which the block comes (used for fetching titles) + // and the unique id of the block (used for creating block indexes) + + maps[map_index_0] = static_cast(prev_start_index); + maps[map_index_0 + 1] = static_cast(sent_index + 1); + maps[map_index_0 + 2] = static_cast(doc); + maps[map_index_0 + 3] = static_cast(block_id); + } + + // Update indices / counters. + ++map_index; + ++block_id; + prev_start_index = sent_index + 1; + seq_len = 0; + num_sent = 0; + } + } // for (auto sent_index=sent_index_first; ... + } // if (num_remain_sent > 1) { + } // for (int doc=0; doc < num_docs; ++doc) { + } // for (int epoch=0; epoch < num_epochs; ++epoch) { + + if (!second) + { + if (verbose) + { + cout << " number of empty documents: " << empty_docs << endl + << std::flush; + cout << " number of documents with one sentence: " << one_sent_docs << endl + << std::flush; + cout << " number of documents with long sentences: " << long_sent_docs << endl + << std::flush; + cout << " will create mapping for " << map_index << " samples" << endl + << std::flush; + } + assert(maps == NULL); + assert(num_samples < 0); + maps = new DocIdx[4 * map_index]; + num_samples = static_cast(map_index); + } + + } // for (int iteration=0; iteration < 2; ++iteration) { + + // Shuffle. + // We need a 64 bit random number generator as we might have more + // than 2 billion samples. + std::mt19937_64 rand64_gen(seed + 1); + for (auto i = (num_samples - 1); i > 0; --i) + { + const auto j = static_cast(rand64_gen() % (i + 1)); + const auto i0 = 4 * i; + const auto j0 = 4 * j; + // Swap values. + swap(maps[i0], maps[j0]); + swap(maps[i0 + 1], maps[j0 + 1]); + swap(maps[i0 + 2], maps[j0 + 2]); + swap(maps[i0 + 3], maps[j0 + 3]); + } + + // Method to deallocate memory. + py::capsule free_when_done(maps, [](void *mem_) + { + DocIdx *mem = reinterpret_cast(mem_); + delete[] mem; + }); + + // Return the numpy array. 
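After the map is populated, both builders shuffle whole rows in place with a 64-bit Mersenne Twister, since the sample count can exceed 2 billion. In NumPy terms the row shuffle is a plain Fisher-Yates pass; note that this sketch uses NumPy's Generator, so for a given seed it does not reproduce the C++ permutation:

import numpy as np

def shuffle_rows_sketch(maps, seed):
    # maps: 2-D array with one sample per row; shuffled in place, rows kept intact
    rng = np.random.default_rng(seed + 1)
    for i in range(len(maps) - 1, 0, -1):
        j = int(rng.integers(0, i + 1))
        maps[[i, j]] = maps[[j, i]]
    return maps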
+ const auto byte_size = sizeof(DocIdx); + return py::array(std::vector{num_samples, 4}, // shape + {4 * byte_size, byte_size}, // C-style contiguous strides + maps, // the data pointer + free_when_done); // numpy array references +} + +py::array build_blocks_mapping(const py::array_t &docs_, + const py::array_t &sizes_, + const py::array_t &titles_sizes_, + const int num_epochs, + const uint64_t max_num_samples, + const int max_seq_length, + const int seed, + const bool verbose, + const bool use_one_sent_blocks) +{ + + if (sizes_.size() > std::numeric_limits::max()) + { + if (verbose) + { + cout << " using uint64 for data mapping..." << endl + << std::flush; + } + return build_blocks_mapping_impl(docs_, sizes_, titles_sizes_, + num_epochs, max_num_samples, max_seq_length, seed, verbose, use_one_sent_blocks); + } + else + { + if (verbose) + { + cout << " using uint32 for data mapping..." << endl + << std::flush; + } + return build_blocks_mapping_impl(docs_, sizes_, titles_sizes_, + num_epochs, max_num_samples, max_seq_length, seed, verbose, use_one_sent_blocks); + } +} + +PYBIND11_MODULE(helpers, m) +{ + m.def("build_mapping", &build_mapping); + m.def("build_blocks_mapping", &build_blocks_mapping); + m.def("build_sample_idx", &build_sample_idx); + m.def("build_blending_indices", &build_blending_indices); +} diff --git a/fengshen/data/megatron_dataloader/indexed_dataset.py b/fengshen/data/megatron_dataloader/indexed_dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..9eba91d303ab11884d993b707ca1d166f540588b --- /dev/null +++ b/fengshen/data/megatron_dataloader/indexed_dataset.py @@ -0,0 +1,585 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + + +# copied from fairseq/fairseq/data/indexed_dataset.py +# Removed IndexedRawTextDataset since it relied on Fairseq dictionary +# other slight modifications to remove fairseq dependencies +# Added document index to index file and made it accessible. +# An empty sentence no longer separates documents. 
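Once helpers.cpp has been compiled into an importable extension (the build step itself is not part of this diff), the four bound functions are called with positional arguments, since the bindings do not declare py::arg names. A hedged usage sketch for build_sample_idx with made-up toy inputs; the import path and array dtypes are assumptions:

import numpy as np
# assumption: the extension has been built and is importable from this package
from fengshen.data.megatron_dataloader import helpers

sizes = np.array([5, 7, 3, 9], dtype=np.int32)    # token count of each document
doc_idx = np.array([0, 1, 2, 3], dtype=np.int32)  # document order for one epoch
sample_idx = helpers.build_sample_idx(
    sizes, doc_idx,
    8,                    # seq_length
    1,                    # num_epochs
    int(sizes.sum()))     # tokens_per_epoch
# sample_idx has shape (num_samples + 1, 2):
# column 0 is the index into doc_idx, column 1 the offset where a sample starts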
+ +from functools import lru_cache +import os +import shutil +import struct +from itertools import accumulate + +import numpy as np +import torch +from fengshen.data.megatron_dataloader.utils import print_rank_0 + + +def __best_fitting_dtype(vocab_size=None): + if vocab_size is not None and vocab_size < 65500: + return np.uint16 + else: + return np.int32 + + +def get_available_dataset_impl(): + return ['lazy', 'cached', 'mmap'] + + +def infer_dataset_impl(path): + if IndexedDataset.exists(path): + with open(index_file_path(path), 'rb') as f: + magic = f.read(8) + if magic == IndexedDataset._HDR_MAGIC: + return 'cached' + elif magic == MMapIndexedDataset.Index._HDR_MAGIC[:8]: + return 'mmap' + else: + return None + else: + print(f"Dataset does not exist: {path}") + print("Path should be a basename that both .idx and " + ".bin can be appended to get full filenames.") + return None + + +def make_builder(out_file, impl, vocab_size=None): + if impl == 'mmap': + return MMapIndexedDatasetBuilder(out_file, + dtype=__best_fitting_dtype(vocab_size)) + else: + return IndexedDatasetBuilder(out_file) + + +def make_dataset(path, impl, skip_warmup=False): + if not IndexedDataset.exists(path): + print(f"Dataset does not exist: {path}") + print("Path should be a basename that both .idx " + "and .bin can be appended to get full filenames.") + return None + if impl == 'infer': + impl = infer_dataset_impl(path) + if impl == 'lazy' and IndexedDataset.exists(path): + return IndexedDataset(path) + elif impl == 'cached' and IndexedDataset.exists(path): + return IndexedCachedDataset(path) + elif impl == 'mmap' and MMapIndexedDataset.exists(path): + return MMapIndexedDataset(path, skip_warmup) + print(f"Unknown dataset implementation: {impl}") + return None + + +def dataset_exists(path, impl): + if impl == 'mmap': + return MMapIndexedDataset.exists(path) + else: + return IndexedDataset.exists(path) + + +def read_longs(f, n): + a = np.empty(n, dtype=np.int64) + f.readinto(a) + return a + + +def write_longs(f, a): + f.write(np.array(a, dtype=np.int64)) + + +dtypes = { + 1: np.uint8, + 2: np.int8, + 3: np.int16, + 4: np.int32, + 5: np.int64, + 6: np.float, + 7: np.double, + 8: np.uint16 +} + + +def code(dtype): + for k in dtypes.keys(): + if dtypes[k] == dtype: + return k + raise ValueError(dtype) + + +def index_file_path(prefix_path): + return prefix_path + '.idx' + + +def data_file_path(prefix_path): + return prefix_path + '.bin' + + +def create_doc_idx(sizes): + doc_idx = [0] + for i, s in enumerate(sizes): + if s == 0: + doc_idx.append(i + 1) + return doc_idx + + +class IndexedDataset(torch.utils.data.Dataset): + """Loader for IndexedDataset""" + _HDR_MAGIC = b'TNTIDX\x00\x00' + + def __init__(self, path): + super().__init__() + self.path = path + self.data_file = None + self.read_index(path) + + def read_index(self, path): + with open(index_file_path(path), 'rb') as f: + magic = f.read(8) + assert magic == self._HDR_MAGIC, ( + 'Index file doesn\'t match expected format. ' + 'Make sure that --dataset-impl is configured properly.' 
+ ) + version = f.read(8) + assert struct.unpack('= self._len: + raise IndexError('index out of range') + + def __del__(self): + if self.data_file: + self.data_file.close() + + # @lru_cache(maxsize=8) + def __getitem__(self, idx): + if not self.data_file: + self.read_data(self.path) + if isinstance(idx, int): + i = idx + self.check_index(i) + tensor_size = self.sizes[ + self.dim_offsets[i]:self.dim_offsets[i + 1]] + a = np.empty(tensor_size, dtype=self.dtype) + self.data_file.seek(self.data_offsets[i] * self.element_size) + self.data_file.readinto(a) + return a + elif isinstance(idx, slice): + start, stop, step = idx.indices(len(self)) + if step != 1: + raise ValueError( + "Slices into indexed_dataset must be contiguous") + sizes = self.sizes[self.dim_offsets[start]:self.dim_offsets[stop]] + size = sum(sizes) + a = np.empty(size, dtype=self.dtype) + self.data_file.seek(self.data_offsets[start] * self.element_size) + self.data_file.readinto(a) + offsets = list(accumulate(sizes)) + sents = np.split(a, offsets[:-1]) + return sents + + def __len__(self): + return self._len + + def num_tokens(self, index): + return self.sizes[index] + + def size(self, index): + return self.sizes[index] + + @staticmethod + def exists(path): + return ( + os.path.exists(index_file_path(path)) and os.path.exists( + data_file_path(path)) + ) + + @property + def supports_prefetch(self): + return False # avoid prefetching to save memory + + +class IndexedCachedDataset(IndexedDataset): + + def __init__(self, path): + super().__init__(path) + self.cache = None + self.cache_index = {} + + @property + def supports_prefetch(self): + return True + + def prefetch(self, indices): + if all(i in self.cache_index for i in indices): + return + if not self.data_file: + self.read_data(self.path) + indices = sorted(set(indices)) + total_size = 0 + for i in indices: + total_size += self.data_offsets[i + 1] - self.data_offsets[i] + self.cache = np.empty(total_size, dtype=self.dtype) + ptx = 0 + self.cache_index.clear() + for i in indices: + self.cache_index[i] = ptx + size = self.data_offsets[i + 1] - self.data_offsets[i] + a = self.cache[ptx: ptx + size] + self.data_file.seek(self.data_offsets[i] * self.element_size) + self.data_file.readinto(a) + ptx += size + if self.data_file: + # close and delete data file after prefetch so we can pickle + self.data_file.close() + self.data_file = None + + # @lru_cache(maxsize=8) + def __getitem__(self, idx): + if isinstance(idx, int): + i = idx + self.check_index(i) + tensor_size = self.sizes[ + self.dim_offsets[i]:self.dim_offsets[i + 1]] + a = np.empty(tensor_size, dtype=self.dtype) + ptx = self.cache_index[i] + np.copyto(a, self.cache[ptx: ptx + a.size]) + return a + elif isinstance(idx, slice): + # Hack just to make this work, can optimizer later if necessary + sents = [] + for i in range(*idx.indices(len(self))): + sents.append(self[i]) + return sents + + +class IndexedDatasetBuilder(object): + element_sizes = { + np.uint8: 1, + np.int8: 1, + np.int16: 2, + np.int32: 4, + np.int64: 8, + np.float: 4, + np.double: 8 + } + + def __init__(self, out_file, dtype=np.int32): + self.out_file = open(out_file, 'wb') + self.dtype = dtype + self.data_offsets = [0] + self.dim_offsets = [0] + self.sizes = [] + self.element_size = self.element_sizes[self.dtype] + self.doc_idx = [0] + + def add_item(self, tensor): + bytes = self.out_file.write(np.array(tensor.numpy(), dtype=self.dtype)) + self.data_offsets.append( + self.data_offsets[-1] + bytes / self.element_size) + for s in tensor.size(): + 
self.sizes.append(s) + self.dim_offsets.append(self.dim_offsets[-1] + len(tensor.size())) + + def end_document(self): + self.doc_idx.append(len(self.sizes)) + + def merge_file_(self, another_file): + index = IndexedDataset(another_file) + assert index.dtype == self.dtype + + begin = self.data_offsets[-1] + for offset in index.data_offsets[1:]: + self.data_offsets.append(begin + offset) + self.sizes.extend(index.sizes) + begin = self.dim_offsets[-1] + for dim_offset in index.dim_offsets[1:]: + self.dim_offsets.append(begin + dim_offset) + + with open(data_file_path(another_file), 'rb') as f: + while True: + data = f.read(1024) + if data: + self.out_file.write(data) + else: + break + + def finalize(self, index_file): + self.out_file.close() + index = open(index_file, 'wb') + index.write(b'TNTIDX\x00\x00') + index.write(struct.pack(' None: + return super().setup(stage) + + def train_dataloader(self): + return DataLoader( + self.train_dataset, + batch_size=self.hparams.train_batchsize, + shuffle=True, + num_workers=self.hparams.num_workers, + collate_fn=self.collate_fn, + ) + + def val_dataloader(self): + return DataLoader( + self.valid_dataset, + batch_size=self.hparams.eval_batchsize, + shuffle=True, + num_workers=self.hparams.num_workers, + collate_fn=self.collate_fn, + ) + + def test_dataloader(self): + return DataLoader( + self.test_dataset, + batch_size=self.hparams.test_batchsize, + shuffle=True, + num_workers=self.hparams.num_workers, + collate_fn=self.collate_fn, + ) diff --git a/fengshen/data/mmap_dataloader/mmap_index_dataset.py b/fengshen/data/mmap_dataloader/mmap_index_dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..53b290c12a8825a483f14ca0535a813b36477fa1 --- /dev/null +++ b/fengshen/data/mmap_dataloader/mmap_index_dataset.py @@ -0,0 +1,53 @@ +import numpy as np +import torch +from typing import List +from torch.utils.data import Dataset + + +class MMapIndexDataset(Dataset): + # datapaths 是所有的内存映射文件的路径 + # input_tensor_name 是输入的tensor的名字 例如 ['input_ids'] 会存储在对应的文件里面 + def __init__(self, datapaths: List[str], input_tensor_name: List[str]): + dict_idx_fp = {} + dict_bin_fp = {} + idx_len = [] + for tensor_name in input_tensor_name: + idx_fp = [] + bin_fp = [] + len = 0 + for data_path in datapaths: + idx_fp += [np.load( + data_path + '_' + tensor_name + '.npy', mmap_mode='r')] + bin_fp += [np.memmap( + data_path + '_' + tensor_name + '.bin', + dtype='long', + mode='r')] + len += idx_fp[-1].shape[0] + idx_len += [idx_fp[-1].shape[0]] + dict_idx_fp[tensor_name] = idx_fp + dict_bin_fp[tensor_name] = bin_fp + #  通常情况下不同的tensor的长度是一样的 + self._len = len + + self._input_tensor_name = input_tensor_name + self._dict_idx_fp = dict_idx_fp + self._dict_bin_fp = dict_bin_fp + self._idx_len = idx_len + + def __len__(self): + return self._len + + def __getitem__(self, idx): + sample = {} + for i in range(len(self._idx_len)): + if idx >= self._idx_len[i]: + idx -= self._idx_len[i] + else: + break + for tensor_name in self._input_tensor_name: + sample[tensor_name] = torch.tensor(self._dict_bin_fp[tensor_name][i][ + self._dict_idx_fp[tensor_name][i][idx, 0]: + self._dict_idx_fp[tensor_name][i][idx, 1] + ], dtype=torch.long) + # print(sample) + return sample diff --git a/fengshen/data/preprocess.py b/fengshen/data/preprocess.py new file mode 100644 index 0000000000000000000000000000000000000000..9bad5790a5799b96f2e164d825c0b1f8ec0c2dfb --- /dev/null +++ b/fengshen/data/preprocess.py @@ -0,0 +1 @@ +# coding=utf-8 diff --git 
a/fengshen/data/sequence_tagging_dataloader/sequence_tagging_collator.py b/fengshen/data/sequence_tagging_dataloader/sequence_tagging_collator.py new file mode 100644 index 0000000000000000000000000000000000000000..b21ff7a0f9152ac16cb434078ac8436dcceeec1a --- /dev/null +++ b/fengshen/data/sequence_tagging_dataloader/sequence_tagging_collator.py @@ -0,0 +1,274 @@ +from dataclasses import dataclass +from torch.utils.data._utils.collate import default_collate + +import copy +import torch +import numpy as np + +@dataclass +class CollatorForLinear: + args = None + tokenizer = None + label2id = None + + def __call__(self, samples): + cls_token = "[CLS]" + sep_token = "[SEP]" + pad_token = 0 + special_tokens_count = 2 + segment_id = 0 + + features=[] + + for (ex_index, example) in enumerate(samples): + tokens = copy.deepcopy(example['text_a']) + + label_ids = [self.label2id[x] for x in example['labels']] + + if len(tokens) > self.args.max_seq_length - special_tokens_count: + tokens = tokens[: (self.args.max_seq_length - special_tokens_count)] + label_ids = label_ids[: (self.args.max_seq_length - special_tokens_count)] + + tokens += [sep_token] + label_ids += [self.label2id["O"]] + segment_ids = [segment_id] * len(tokens) + + tokens = [cls_token] + tokens + label_ids = [self.label2id["O"]] + label_ids + segment_ids = [segment_id] + segment_ids + + input_ids = self.tokenizer.convert_tokens_to_ids(tokens) + input_mask = [1] * len(input_ids) + input_len = len(label_ids) + padding_length = self.args.max_seq_length - len(input_ids) + + input_ids += [pad_token] * padding_length + input_mask += [0] * padding_length + segment_ids += [segment_id] * padding_length + label_ids += [pad_token] * padding_length + + assert len(input_ids) == self.args.max_seq_length + assert len(input_mask) == self.args.max_seq_length + assert len(segment_ids) == self.args.max_seq_length + assert len(label_ids) == self.args.max_seq_length + + features.append({ + 'input_ids':torch.tensor(input_ids), + 'attention_mask':torch.tensor(input_mask), + 'input_len':torch.tensor(input_len), + 'token_type_ids':torch.tensor(segment_ids), + 'labels':torch.tensor(label_ids), + }) + + return default_collate(features) + +@dataclass +class CollatorForCrf: + args = None + tokenizer = None + label2id = None + + def __call__(self, samples): + features = [] + cls_token = "[CLS]" + sep_token = "[SEP]" + pad_token = 0 + special_tokens_count = 2 + segment_id = 0 + + for (ex_index, example) in enumerate(samples): + tokens = copy.deepcopy(example['text_a']) + + label_ids = [self.label2id[x] for x in example['labels']] + + if len(tokens) > self.args.max_seq_length - special_tokens_count: + tokens = tokens[: (self.args.max_seq_length - special_tokens_count)] + label_ids = label_ids[: (self.args.max_seq_length - special_tokens_count)] + + tokens += [sep_token] + label_ids += [self.label2id["O"]] + segment_ids = [segment_id] * len(tokens) + + tokens = [cls_token] + tokens + label_ids = [self.label2id["O"]] + label_ids + segment_ids = [segment_id] + segment_ids + + input_ids = self.tokenizer.convert_tokens_to_ids(tokens) + input_mask = [1] * len(input_ids) + input_len = len(label_ids) + padding_length = self.args.max_seq_length - len(input_ids) + + input_ids += [pad_token] * padding_length + input_mask += [0] * padding_length + segment_ids += [segment_id] * padding_length + label_ids += [pad_token] * padding_length + + assert len(input_ids) == self.args.max_seq_length + assert len(input_mask) == self.args.max_seq_length + assert len(segment_ids) == 
self.args.max_seq_length + assert len(label_ids) == self.args.max_seq_length + + features.append({ + 'input_ids':torch.tensor(input_ids), + 'attention_mask':torch.tensor(input_mask), + 'input_len':torch.tensor(input_len), + 'token_type_ids':torch.tensor(segment_ids), + 'labels':torch.tensor(label_ids), + }) + + return default_collate(features) + + +@dataclass +class CollatorForSpan: + args = None + tokenizer = None + label2id = None + + def __call__(self, samples): + + features = [] + cls_token = "[CLS]" + sep_token = "[SEP]" + pad_token = 0 + special_tokens_count = 2 + max_entities_count = 100 + segment_id = 0 + + for (ex_index, example) in enumerate(samples): + subjects = copy.deepcopy(example['subject']) + tokens = copy.deepcopy(example['text_a']) + start_ids = [0] * len(tokens) + end_ids = [0] * len(tokens) + subject_ids = [] + for subject in subjects: + label = subject[0] + start = subject[1] + end = subject[2] + start_ids[start] = self.label2id[label] + end_ids[end] = self.label2id[label] + subject_ids.append([self.label2id[label], start, end]) + + subject_ids+=[[-1,-1,-1]]*(max_entities_count-len(subject_ids)) + + if len(tokens) > self.args.max_seq_length - special_tokens_count: + tokens = tokens[: (self.args.max_seq_length - special_tokens_count)] + start_ids = start_ids[: (self.args.max_seq_length - special_tokens_count)] + end_ids = end_ids[: (self.args.max_seq_length - special_tokens_count)] + + tokens += [sep_token] + start_ids += [0] + end_ids += [0] + segment_ids = [segment_id] * len(tokens) + + tokens = [cls_token] + tokens + start_ids = [0] + start_ids + end_ids = [0] + end_ids + segment_ids = [segment_id] + segment_ids + + input_ids = self.tokenizer.convert_tokens_to_ids(tokens) + input_mask = [1] * len(input_ids) + input_len = len(input_ids) + padding_length = self.args.max_seq_length - len(input_ids) + + input_ids += [pad_token] * padding_length + input_mask += [0] * padding_length + segment_ids += [segment_id] * padding_length + start_ids += [0] * padding_length + end_ids += [0] * padding_length + + assert len(input_ids) == self.args.max_seq_length + assert len(input_mask) == self.args.max_seq_length + assert len(segment_ids) == self.args.max_seq_length + assert len(start_ids) == self.args.max_seq_length + assert len(end_ids) == self.args.max_seq_length + + features.append({ + 'input_ids': torch.tensor(np.array(input_ids)), + 'attention_mask': torch.tensor(np.array(input_mask)), + 'token_type_ids': torch.tensor(np.array(segment_ids)), + 'start_positions': torch.tensor(np.array(start_ids)), + 'end_positions': torch.tensor(np.array(end_ids)), + "subjects": torch.tensor(np.array(subject_ids)), + 'input_len': torch.tensor(np.array(input_len)), + }) + + return default_collate(features) + + +@dataclass +class CollatorForBiaffine: + args = None + tokenizer = None + label2id = None + + + def __call__(self, samples): + + features = [] + cls_token = "[CLS]" + sep_token = "[SEP]" + pad_token = 0 + special_tokens_count = 2 + segment_id = 0 + + for (ex_index, example) in enumerate(samples): + subjects = copy.deepcopy(example['subject']) + tokens = copy.deepcopy(example['text_a']) + + span_labels = np.zeros((self.args.max_seq_length,self.args.max_seq_length)) + span_labels[:] = self.label2id["O"] + + for subject in subjects: + label = subject[0] + start = subject[1] + end = subject[2] + if start < self.args.max_seq_length - special_tokens_count and end < self.args.max_seq_length - special_tokens_count: + span_labels[start + 1, end + 1] = self.label2id[label] + + if len(tokens) > 
self.args.max_seq_length - special_tokens_count: + tokens = tokens[: (self.args.max_seq_length - special_tokens_count)] + + tokens += [sep_token] + span_labels[len(tokens), :] = self.label2id["O"] + span_labels[:, len(tokens)] = self.label2id["O"] + segment_ids = [segment_id] * len(tokens) + + tokens = [cls_token] + tokens + span_labels[0, :] = self.label2id["O"] + span_labels[:, 0] = self.label2id["O"] + segment_ids = [segment_id] + segment_ids + + input_ids = self.tokenizer.convert_tokens_to_ids(tokens) + input_mask = [0] * len(input_ids) + span_mask = np.ones(span_labels.shape) + input_len = len(input_ids) + + padding_length = self.args.max_seq_length - len(input_ids) + + input_ids += [pad_token] * padding_length + input_mask += [0] * padding_length + segment_ids += [segment_id] * padding_length + span_labels[input_len:, :] = 0 + span_labels[:, input_len:] = 0 + span_mask[input_len:, :] = 0 + span_mask[:, input_len:] = 0 + span_mask=np.triu(span_mask,0) + span_mask=np.tril(span_mask,10) + + assert len(input_ids) == self.args.max_seq_length + assert len(input_mask) == self.args.max_seq_length + assert len(segment_ids) == self.args.max_seq_length + assert len(span_labels) == self.args.max_seq_length + assert len(span_labels[0]) == self.args.max_seq_length + + features.append({ + 'input_ids': torch.tensor(np.array(input_ids)), + 'attention_mask': torch.tensor(np.array(input_mask)), + 'token_type_ids': torch.tensor(np.array(segment_ids)), + 'span_labels': torch.tensor(np.array(span_labels)), + 'span_mask': torch.tensor(np.array(span_mask)), + 'input_len': torch.tensor(np.array(input_len)), + }) + + return default_collate(features) \ No newline at end of file diff --git a/fengshen/data/sequence_tagging_dataloader/sequence_tagging_datasets.py b/fengshen/data/sequence_tagging_dataloader/sequence_tagging_datasets.py new file mode 100644 index 0000000000000000000000000000000000000000..f2e53cbf3d6bd3d2185e66dd0b7fdcfa1b8c44d0 --- /dev/null +++ b/fengshen/data/sequence_tagging_dataloader/sequence_tagging_datasets.py @@ -0,0 +1,116 @@ +from torch.utils.data import Dataset +from fengshen.metric.utils_ner import get_entities + +import os + +def get_datasets(args): + processor = DataProcessor(args.data_dir, args.decode_type) + + train_data = TaskDataset(processor=processor, mode="train") + valid_data = TaskDataset(processor=processor, mode="dev") + test_data = TaskDataset(processor=processor, mode="dev") + + return {"train":train_data,"validation":valid_data,"test":test_data} + +# def get_labels(decode_type): +# with open("/cognitive_comp/lujunyu/data_zh/NER_Aligned/weibo/labels.txt") as f: +# label_list = ["[PAD]", "[START]", "[END]"] + +# if decode_type=="crf" or decode_type=="linear": +# for line in f.readlines(): +# label_list.append(line.strip()) +# elif decode_type=="biaffine" or decode_type=="span": +# for line in f.readlines(): +# tag = line.strip().split("-") +# if len(tag) == 1 and tag[0] not in label_list: +# label_list.append(tag[0]) +# elif tag[1] not in label_list: +# label_list.append(tag[1]) + +# label2id={label:id for id,label in enumerate(label_list)} +# id2label={id:label for id,label in enumerate(label_list)} +# return label2id, id2label + +class DataProcessor(object): + def __init__(self, data_dir, decode_type) -> None: + super().__init__() + self.data_dir = data_dir + self.decode_type = decode_type + + def get_examples(self, mode): + return self._create_examples(self._read_text(os.path.join(self.data_dir, mode + ".all.bmes")), mode) + + @staticmethod + def get_labels(args): + 
with open(os.path.join(args.data_dir, "labels.txt")) as f: + label_list = ["[PAD]", "[START]", "[END]"] + + if args.decode_type=="crf" or args.decode_type=="linear": + for line in f.readlines(): + label_list.append(line.strip()) + elif args.decode_type=="biaffine" or args.decode_type=="span": + for line in f.readlines(): + tag = line.strip().split("-") + if len(tag) == 1 and tag[0] not in label_list: + label_list.append(tag[0]) + elif tag[1] not in label_list: + label_list.append(tag[1]) + + label2id = {label: i for i, label in enumerate(label_list)} + id2label={id:label for id,label in enumerate(label_list)} + return label2id,id2label + + def _create_examples(self, lines, set_type): + examples = [] + for (i, line) in enumerate(lines): + guid = "%s-%s" % (set_type, i) + text_a = line['words'] + labels = [] + for x in line['labels']: + if 'M-' in x: + labels.append(x.replace('M-', 'I-')) + else: + labels.append(x) + subject = get_entities(labels, id2label=None, markup='bioes') + examples.append({'guid':guid, 'text_a':text_a, 'labels':labels, 'subject':subject}) + return examples + + @classmethod + def _read_text(self, input_file): + lines = [] + with open(input_file, 'r') as f: + words = [] + labels = [] + for line in f: + if line.startswith("-DOCSTART-") or line == "" or line == "\n": + if words: + lines.append({"words": words, "labels": labels}) + words = [] + labels = [] + else: + splits = line.split() + words.append(splits[0]) + if len(splits) > 1: + labels.append(splits[-1].replace("\n", "")) + else: + # Examples could have no label for mode = "test" + labels.append("O") + if words: + lines.append({"words": words, "labels": labels}) + return lines + + +class TaskDataset(Dataset): + def __init__(self, processor, mode='train'): + super().__init__() + self.data = self.load_data(processor, mode) + + def __len__(self): + return len(self.data) + + def __getitem__(self, index): + return self.data[index] + + def load_data(self, processor, mode): + examples = processor.get_examples(mode) + return examples \ No newline at end of file diff --git a/fengshen/data/t5_dataloader/t5_datasets.py b/fengshen/data/t5_dataloader/t5_datasets.py new file mode 100644 index 0000000000000000000000000000000000000000..4fd55b8d0be1dd61881b8c782a7eea7a6123efdd --- /dev/null +++ b/fengshen/data/t5_dataloader/t5_datasets.py @@ -0,0 +1,562 @@ +# coding=utf8 +import json +from torch.utils.data import Dataset, DataLoader +from tqdm import tqdm +from transformers import BertTokenizer, MT5Config, MT5Tokenizer, BatchEncoding +import torch +import pytorch_lightning as pl +import numpy as np +from itertools import chain +import sys +sys.path.append('../../') + + +def compute_input_and_target_lengths(inputs_length, noise_density, mean_noise_span_length): + """This function is copy of `random_spans_helper `__ . + Training parameters to avoid padding with random_spans_noise_mask. + When training a model with random_spans_noise_mask, we would like to set the other + training hyperparmeters in a way that avoids padding. + This function helps us compute these hyperparameters. + We assume that each noise span in the input is replaced by extra_tokens_per_span_inputs sentinel tokens, + and each non-noise span in the targets is replaced by extra_tokens_per_span_targets sentinel tokens. + This function tells us the required number of tokens in the raw example (for split_tokens()) + as well as the length of the encoded targets. 
Note that this function assumes + the inputs and targets will have EOS appended and includes that in the reported length. + Args: + inputs_length: an integer - desired length of the tokenized inputs sequence + noise_density: a float + mean_noise_span_length: a float + Returns: + tokens_length: length of original text in tokens + targets_length: an integer - length in tokens of encoded targets sequence + """ + + def _tokens_length_to_inputs_length_targets_length(tokens_length): + num_noise_tokens = int(round(tokens_length * noise_density)) + num_nonnoise_tokens = tokens_length - num_noise_tokens + num_noise_spans = int(round(num_noise_tokens / mean_noise_span_length)) + # inputs contain all nonnoise tokens, sentinels for all noise spans + # and one EOS token. + _input_length = num_nonnoise_tokens + num_noise_spans + 1 + _output_length = num_noise_tokens + num_noise_spans + 1 + return _input_length, _output_length + + tokens_length = inputs_length + + while _tokens_length_to_inputs_length_targets_length(tokens_length + 1)[0] <= inputs_length: + tokens_length += 1 + + inputs_length, targets_length = _tokens_length_to_inputs_length_targets_length( + tokens_length) + + # minor hack to get the targets length to be equal to inputs length + # which is more likely to have been set to a nice round number. + if noise_density == 0.5 and targets_length > inputs_length: + tokens_length -= 1 + targets_length -= 1 + return tokens_length, targets_length + + +class UnsuperviseT5Dataset(Dataset): + ''' + Dataset Used for T5 unsuprvise pretrain. + load_data_type = 0: load raw data from data path and save tokenized data, call function load_data + load_data_type = 1: load tokenized data from path, call function load_tokenized_data + load_data_type = 2: load tokenized data from memery data, call function load_tokenized_memory_data + ''' + + def __init__(self, data_path, args, load_data_type=0, data=None): + super().__init__() + + if args.tokenizer_type == 't5_tokenizer': + if args.new_vocab_path is not None: + self.tokenizer = MT5Tokenizer.from_pretrained(args.new_vocab_path) + else: + self.tokenizer = MT5Tokenizer.from_pretrained(args.pretrained_model_path) + else: + self.tokenizer = BertTokenizer.from_pretrained(args.pretrained_model_path) + self.noise_density = 0.15 + self.mean_noise_span_length = 3 + self.text_column_name = args.text_column_name + self.dataset_num_workers = args.dataset_num_workers + self.max_seq_length = args.max_seq_length + self.remove_columns = args.remove_columns + # whether load tokenieze data + self.load_data_type = load_data_type + + if self.load_data_type == 0: + # T5-like span masked language modeling will fuse consecutively masked tokens to a single sentinel token. + # To ensure that the input length is `max_seq_length`, we need to increase the maximum length + # according to `mlm_probability` and `mean_noise_span_length`. + # We can also define the label length accordingly. 
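As a concrete check of the comment above: with the defaults fixed in this class (noise_density=0.15, mean_noise_span_length=3) and assuming max_seq_length=512, the expansion works out to:

tokens_length, targets_length = compute_input_and_target_lengths(
    inputs_length=512, noise_density=0.15, mean_noise_span_length=3)
# tokens_length == 568: 85 noise tokens collapse into 28 sentinels, so
#   483 non-noise tokens + 28 sentinels + 1 EOS = exactly 512 input tokens
# targets_length == 114: 85 noise tokens + 28 sentinels + 1 EOS

So each raw chunk fed to the masking collator is 568 tokens long, not 512.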
+ self.expanded_inputs_length, self.targets_length = compute_input_and_target_lengths( + inputs_length=self.max_seq_length, + noise_density=self.noise_density, + mean_noise_span_length=self.mean_noise_span_length, + ) + print('self.expanded_inputs_length, self.targets_length:{},{}'.format( + self.expanded_inputs_length, self.targets_length)) + self.data = self.load_data(data_path) + elif self.load_data_type == 1: + self.data = self.load_tokenized_data(data_path) + else: + assert data is not None + self.data = self.load_tokenized_memory_data(data) + + def __len__(self): + return len(self.data) + + def __getitem__(self, index): + return self.data[index] + + def load_data(self, data_path): + # TODO: large data process + from data.fs_datasets import load_dataset + samples = load_dataset( + # samples = datasets.load_from_disk(data_path)['train'] + data_path, num_proc=self.dataset_num_workers)['train'] + # print(samples) + tokenized_datasets = samples.map( + self.tokenize_function, + batched=True, + num_proc=self.dataset_num_workers, + # load_from_cache_file=not data_args.overwrite_cache, + ).map( + batched=True, + num_proc=self.dataset_num_workers, + remove_columns=self.remove_columns) + # Note that with `batched=True`, this map processes 1,000 texts together, so group_texts throws away a + # remainder for each of those groups of 1,000 texts. You can adjust that batch_size here but a higher value + # might be slower to preprocess. + # + # To speed up this part, we use multiprocessing. See the documentation of the map method for more information: + # https://huggingface.co./docs/datasets/package_reference/main_classes.html#datasets.Dataset.map + tokenized_datasets = tokenized_datasets.map( + self.group_texts, + batched=True, + num_proc=self.dataset_num_workers, + # load_from_cache_file=not data_args.overwrite_cache, + ) + return tokenized_datasets + ''' + The function load tokenized data saved from load_data function. + ''' + + def load_tokenized_data(self, data_path): + from data.fs_datasets import load_dataset + samples = load_dataset(data_path)['train'] + return samples + + def load_tokenized_memory_data(self, data): + return data + + # Otherwise, we tokenize every text, then concatenate them together before splitting them in smaller parts. + # Since we make sure that all sequences are of the same length, no attention_mask is needed. + def tokenize_function(self, examples): + # 这里add_special_tokens=False,避免句子中间出现eos + return self.tokenizer(examples[self.text_column_name], + add_special_tokens=False, + return_attention_mask=False) + + # Main data processing function that will concatenate all texts from our dataset + # and generate chunks of expanded_inputs_length. + def group_texts(self, examples): + # Concatenate all texts. + concatenated_examples = { + k: list(chain(*examples[k])) for k in examples.keys()} + total_length = len(concatenated_examples[list(examples.keys())[0]]) + # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can + # customize this part to your needs. + if total_length >= self.expanded_inputs_length: + total_length = ( + total_length // self.expanded_inputs_length) * self.expanded_inputs_length + # Split by chunks of max_len. 
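The group_texts step does nothing more than concatenate every tokenized text and slice the stream into fixed-length pieces, discarding the tail that does not fill a whole chunk. A toy illustration with a hypothetical 23-token stream and chunk size 8:

stream = list(range(23))
chunk = 8
usable = (len(stream) // chunk) * chunk               # 16; the 7-token tail is dropped
chunks = [stream[i:i + chunk] for i in range(0, usable, chunk)]
# chunks == [[0, 1, ..., 7], [8, 9, ..., 15]]

In the real pipeline the chunk size is expanded_inputs_length (see the worked numbers above), so the collator always receives full-length examples and no padding is needed.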
+ result = { + k: [t[i: i + self.expanded_inputs_length] + for i in range(0, total_length, self.expanded_inputs_length)] + for k, t in concatenated_examples.items() + } + return result + + +class UnsuperviseT5DataModel(pl.LightningDataModule): + @staticmethod + def add_data_specific_args(parent_args): + parser = parent_args.add_argument_group('UnsuperviseT5DataModel') + parser.add_argument('--dataset_num_workers', default=8, type=int) + parser.add_argument('--dataloader_num_workers', default=4, type=int) + parser.add_argument( + '--train_data_path', default='wudao_180g_mt5_tokenized', type=str) + parser.add_argument('--train_batchsize', default=2, type=int) + parser.add_argument('--valid_batchsize', default=2, type=int) + parser.add_argument('--train_split_size', default=None, type=float) + parser.add_argument('--tokenizer_type', default='t5_tokenizer', choices=['t5_tokenizer', 'bert_tokenizer']) + parser.add_argument('--text_column_name', default='text') + parser.add_argument('--remove_columns', nargs='+', default=[]) + return parent_args + + def __init__(self, args): + super().__init__() + self.save_hyperparameters(args) + if args.train_split_size is not None: + from data.fs_datasets import load_dataset + data_splits = load_dataset(args.train_data_path, num_proc=args.dataset_num_workers) + train_split = data_splits['train'] + test_split = data_splits['test'] + print('train:', train_split, '\ntest_data:', test_split) + self.train_dataset = UnsuperviseT5Dataset('', args, load_data_type=2, data=train_split) + self.test_dataset = UnsuperviseT5Dataset('', args, load_data_type=2, data=test_split) + else: + self.train_data = UnsuperviseT5Dataset(args.train_data_path, args, load_data_type=1) + + self.config = MT5Config.from_pretrained(args.pretrained_model_path) + self.noise_density = 0.15 + self.mean_noise_span_length = 3 + self.pad_token_id = self.config.pad_token_id + self.decoder_start_token_id = self.config.decoder_start_token_id + self.eos_token_id = self.config.eos_token_id + self.vocab_size = self.config.vocab_size + self.max_seq_length = args.max_seq_length + # 因为加载旧的spm里面已经包括了exrta_ids,但是T5Tokenizer会在spm的基础上再增加100个extra_ids,所以需要指定extra_ids=0 + if args.tokenizer_type == 't5_tokenizer' and args.new_vocab_path is not None: + self.tokenizer = MT5Tokenizer.from_pretrained(args.new_vocab_path, extra_ids=0) + # 如果是刚开始加载mt5,需要更新vocab_size为提取中英词之后的new_vocab_size + self.vocab_size = len(self.tokenizer) + + # T5-like span masked language modeling will fuse consecutively masked tokens to a single sentinel token. + # To ensure that the input length is `max_seq_length`, we need to increase the maximum length + # according to `mlm_probability` and `mean_noise_span_length`. We can also define the label length accordingly. 
+ self.expanded_inputs_length, self.targets_length = compute_input_and_target_lengths( + inputs_length=self.max_seq_length, + noise_density=self.noise_density, + mean_noise_span_length=self.mean_noise_span_length, + ) + + def train_dataloader(self): + from fengshen.data.universal_datamodule.universal_sampler import PretrainingSampler + from fengshen.data.universal_datamodule.universal_datamodule import get_consume_samples + # 采用自定义的sampler,确保继续训练能正确取到数据 + consumed_samples = get_consume_samples(self) + batch_sampler = PretrainingSampler( + total_samples=len(self.train_dataset), + consumed_samples=consumed_samples, + micro_batch_size=self.hparams.train_batchsize, + data_parallel_rank=self.trainer.global_rank, + data_parallel_size=self.trainer.world_size, + ) + return DataLoader( + self.train_dataset, + batch_sampler=batch_sampler, + pin_memory=True, + num_workers=self.hparams.dataloader_num_workers, + collate_fn=self.collate_fn, + ) + + def val_dataloader(self): + sampler = torch.utils.data.distributed.DistributedSampler( + self.test_dataset, shuffle=False) + return DataLoader( + self.test_dataset, + sampler=sampler, + shuffle=False, + batch_size=self.hparams.valid_batchsize, + pin_memory=True, + num_workers=self.hparams.dataloader_num_workers, + collate_fn=self.collate_fn, + ) + + def predict_dataloader(self): + sampler = torch.utils.data.distributed.DistributedSampler( + self.test_dataset, shuffle=False) + return DataLoader( + self.test_data, + sampler=sampler, + shuffle=False, + batch_size=self.hparams.valid_batchsize, + pin_memory=True, + num_workers=self.hparams.dataloader_num_workers, + collate_fn=self.collate_fn, + ) + + def collate_fn(self, examples): + # convert list to dict and tensorize input + batch = BatchEncoding( + {k: np.array([examples[i][k] for i in range(len(examples))]) + for k, v in examples[0].items()} + ) + + input_ids = np.array(batch['input_ids']) + batch_size, expanded_input_length = input_ids.shape + mask_indices = np.asarray([self.random_spans_noise_mask( + expanded_input_length) for i in range(batch_size)]) + labels_mask = ~mask_indices + + input_ids_sentinel = self.create_sentinel_ids( + mask_indices.astype(np.int8)) + labels_sentinel = self.create_sentinel_ids(labels_mask.astype(np.int8)) + + batch["input_ids"] = self.filter_input_ids( + input_ids, input_ids_sentinel) + batch["labels"] = self.filter_input_ids(input_ids, labels_sentinel) + + if batch["input_ids"].shape[-1] != self.max_seq_length: + raise ValueError( + f"`input_ids` are incorrectly preprocessed. `input_ids` length is \ + {batch['input_ids'].shape[-1]}, but should be {self.targets_length}." + ) + + if batch["labels"].shape[-1] != self.targets_length: + raise ValueError( + f"`labels` are incorrectly preprocessed. `labels` length is \ + {batch['labels'].shape[-1]}, but should be {self.targets_length}." + ) + + batch["decoder_input_ids"] = self.shift_tokens_right( + batch["labels"], self.pad_token_id, self.decoder_start_token_id + ) + + for k, v in batch.items(): + batch[k] = torch.tensor(v) + # print(k, batch[k], self.tokenizer.batch_decode(batch[k]), '\n', flush=True) + return batch + + def create_sentinel_ids(self, mask_indices): + """ + Sentinel ids creation given the indices that should be masked. + The start indices of each mask are replaced by the sentinel ids in increasing + order. Consecutive mask indices to be deleted are replaced with `-1`. 
+ """ + start_indices = mask_indices - \ + np.roll(mask_indices, 1, axis=-1) * mask_indices + start_indices[:, 0] = mask_indices[:, 0] + + sentinel_ids = np.where(start_indices != 0, np.cumsum( + start_indices, axis=-1), start_indices) + sentinel_ids = np.where( + sentinel_ids != 0, (self.vocab_size - sentinel_ids), 0) + sentinel_ids -= mask_indices - start_indices + + return sentinel_ids + + def filter_input_ids(self, input_ids, sentinel_ids): + """ + Puts sentinel mask on `input_ids` and fuse consecutive mask tokens into a single mask token by deleting. + This will reduce the sequence length from `expanded_inputs_length` to `input_length`. + """ + batch_size = input_ids.shape[0] + + input_ids_full = np.where(sentinel_ids != 0, sentinel_ids, input_ids) + # input_ids tokens and sentinel tokens are >= 0, tokens < 0 are + # masked tokens coming after sentinel tokens and should be removed + input_ids = input_ids_full[input_ids_full >= + 0].reshape((batch_size, -1)) + input_ids = np.concatenate( + [input_ids, np.full((batch_size, 1), self.eos_token_id, dtype=np.int32)], axis=-1 + ) + return input_ids + + # Copied from transformers.models.bart.modeling_flax_bart.shift_tokens_right + def shift_tokens_right(self, input_ids: np.array, pad_token_id: int, decoder_start_token_id: int) -> np.ndarray: + """ + Shift input ids one token to the right. + """ + shifted_input_ids = np.zeros_like(input_ids) + shifted_input_ids[:, 1:] = input_ids[:, :-1] + shifted_input_ids[:, 0] = decoder_start_token_id + + shifted_input_ids = np.where( + shifted_input_ids == -100, pad_token_id, shifted_input_ids) + return shifted_input_ids + + def random_spans_noise_mask(self, length): + """This function is copy of `random_spans_helper `__ . + Noise mask consisting of random spans of noise tokens. + The number of noise tokens and the number of noise spans and non-noise spans + are determined deterministically as follows: + num_noise_tokens = round(length * noise_density) + num_nonnoise_spans = num_noise_spans = round(num_noise_tokens / mean_noise_span_length) + Spans alternate between non-noise and noise, beginning with non-noise. + Subject to the above restrictions, all masks are equally likely. + Args: + length: an int32 scalar (length of the incoming token sequence) + noise_density: a float - approximate density of output mask + mean_noise_span_length: a number + Returns: + a boolean tensor with shape [length] + """ + + orig_length = length + + num_noise_tokens = int(np.round(length * self.noise_density)) + # avoid degeneracy by ensuring positive numbers of noise and nonnoise tokens. + num_noise_tokens = min(max(num_noise_tokens, 1), length - 1) + num_noise_spans = int( + np.round(num_noise_tokens / self.mean_noise_span_length)) + + # avoid degeneracy by ensuring positive number of noise spans + num_noise_spans = max(num_noise_spans, 1) + num_nonnoise_tokens = length - num_noise_tokens + + # pick the lengths of the noise spans and the non-noise spans + def _random_segmentation(num_items, num_segments): + """Partition a sequence of items randomly into non-empty segments. 
+ Args: + num_items: an integer scalar > 0 + num_segments: an integer scalar in [1, num_items] + Returns: + a Tensor with shape [num_segments] containing positive integers that add + up to num_items + """ + mask_indices = np.arange(num_items - 1) < (num_segments - 1) + np.random.shuffle(mask_indices) + first_in_segment = np.pad(mask_indices, [[1, 0]]) + segment_id = np.cumsum(first_in_segment) + # count length of sub segments assuming that list is sorted + _, segment_length = np.unique(segment_id, return_counts=True) + return segment_length + + noise_span_lengths = _random_segmentation( + num_noise_tokens, num_noise_spans) + nonnoise_span_lengths = _random_segmentation( + num_nonnoise_tokens, num_noise_spans) + + interleaved_span_lengths = np.reshape( + np.stack([nonnoise_span_lengths, noise_span_lengths], + axis=1), [num_noise_spans * 2] + ) + span_starts = np.cumsum(interleaved_span_lengths)[:-1] + span_start_indicator = np.zeros((length,), dtype=np.int8) + span_start_indicator[span_starts] = True + span_num = np.cumsum(span_start_indicator) + is_noise = np.equal(span_num % 2, 1) + + return is_noise[:orig_length] + + +class TaskT5Dataset(Dataset): + def __init__(self, data_path, args): + super().__init__() + self.max_length = args.max_seq_length + if args.tokenizer_type == 't5_tokenizer': + self.tokenizer = MT5Tokenizer.from_pretrained(args.pretrained_model_path) + else: + self.tokenizer = BertTokenizer.from_pretrained(args.pretrained_model_path) + self.data = self.load_data(data_path) + + def __len__(self): + return len(self.data) + + def __getitem__(self, index): + return self.encode(self.data[index]) + + def load_data(self, data_path): + samples = [] + with open(data_path, 'r', encoding='utf8') as f: + lines = f.readlines() + for line in tqdm(lines): + samples.append(json.loads(line)) + return samples + + def encode(self, item): + if item["textb"] != "": + text = item['question'] + ','.join(item['choice'])+'。' + f"""{item["texta"]}""" + f"""{item["textb"]}""" + else: + text = f"""{item["question"]}""" + ",".join(item["choice"]) + "。" + f"""{item["texta"]}""" + label = item['answer'] + encode_dict = self.tokenizer.encode_plus(text, max_length=self.max_length, padding='max_length', + truncation=True, return_tensors='pt') + decode_dict = self.tokenizer.encode_plus(label, max_length=16, padding='max_length', + truncation=True) + + answer_token = [] + max_label_len = 0 + choice_encode = [] # 用来确定模型生成的最大长度 + for a in item['choice']: + answer_encode = self.tokenizer.encode(a) + choice_encode.append(answer_encode) + if len(answer_encode) > max_label_len: + max_label_len = len(answer_encode) + for an in answer_encode: + if an not in answer_token: + answer_token.append(an) + + # bad_words_ids = [[i] for i in range(self.tokenizer.vocab_size) if i not in answer_token] #不生成这些token + + # while len(bad_words_ids) None: + super().__init__() + + if args.tokenizer_type == "t5_tokenizer": + self.tokenizer = MT5Tokenizer.from_pretrained( + args.pretrained_model_path) + if len(self.tokenizer) == 32596: + self.tokenizer.add_special_tokens(special_token_dict) + print( + "add special tokens to tokenizer,vocab size:", + len(self.tokenizer) + ) + self.model = MT5ForConditionalGeneration.from_pretrained( + args.pretrained_model_path + ) + self.model.resize_token_embeddings(len(self.tokenizer)) + self.model.save_pretrained(args.new_vocab_path) + self.tokenizer.save_pretrained( + args.new_vocab_path) + else: + self.tokenizer = BertTokenizer.from_pretrained( + args.pretrained_model_path) + + self.load_data_type = 
load_data_type + self.data_split = data + self.num_workers = args.preprocessing_num_workers + self.max_seq_length = args.max_seq_length + self.max_knowledge_length = args.max_knowledge_length + self.max_target_length = args.max_target_length + + # tokenizer config + self.config = MT5Config.from_pretrained(args.pretrained_model_path) + self.decoder_start_token_id = self.config.decoder_start_token_id + self.eos_token_id = self.config.eos_token_id + self.vocab_size = self.config.vocab_size + # print(self.tokenizer.decode([2])) + + # load from raw data or hf dataset + + if self.load_data_type == 0: + self.data = self.load_data(data_path) + elif self.load_data_type == 1: + self.data = self.load_packed_data(data_path) + else: # for testing + self.data = data_path + + def load_packed_data(self, data_path): + from fengshen.data.fs_datasets import load_dataset + + samples = load_dataset(data_path, + num_proc=self.num_workers)[self.data_split] + tokenized_samples = samples.map( + self.regular_tokenize, batched=False, + num_proc=self.num_workers + ) + + return tokenized_samples + + def load_data(self, data_path): + """ + load data from raw data + return untokoenized data + """ + from datasets import load_dataset + + ds = load_dataset("json", data_files=data_path)['train'] + samples = ds.map(self.regular_tokenize, batched=False, num_proc=self.num_workers + ) + return samples + + def __getitem__(self, index): + return self.data[index] + + def __len__(self): + return len(self.data) + + def regular_tokenize(self, sample): + # print(len(sample['context'])) + context_ids = self.tokenizer( + sample["context"], + add_special_tokens=True, + return_attention_mask=False, + return_token_type_ids=True, + ) + + context_types = self.get_token_type( + sample["context"], context_ids["token_type_ids"] + ) + # print('context',sample['context']) + # print('context_ids',context_ids['input_ids']) + knowledge_ids = self.tokenizer.encode( + sample["knowledge"], add_special_tokens=False + ) + # print('knowledge_ids',knowledge_ids) + if isinstance(knowledge_ids, int): + knowledge_ids = [knowledge_ids] + target_ids = self.tokenizer.encode( + sample["target"], + add_special_tokens=False, + max_length=self.max_target_length - 1, + truncation=True, + ) + # print('target',sample['target']) + # print('target_ids',target_ids) + # print('decode target',self.tokenizer.decode(target_ids)) + # truncate + + knowledge_ids = ( + [self.tokenizer.convert_tokens_to_ids("[KNSTART]")] + + knowledge_ids[: self.max_knowledge_length - 2] + + [self.tokenizer.convert_tokens_to_ids("[KNEND]")] + ) + l_kn = len(knowledge_ids) + knowledge_types = [2] * l_kn + + flatten_context = [] + for line in context_ids["input_ids"]: + flatten_context.extend(line) + l_ct = min(len(flatten_context), self.max_seq_length - l_kn - 2) + context_ids = ( + [self.tokenizer.convert_tokens_to_ids("[CTSTART]")] + + flatten_context[-l_ct:] + + [self.tokenizer.convert_tokens_to_ids("[CTEND]")] + ) + + context_types = context_types[-l_ct:] + [0] + context_types.insert(0, context_types[0]) + assert len(context_ids) == len( + context_types + ), "len of context ids and token types unmatch, context:{},ids:{} types:{},len {}:{}".format( + sample["context"], + context_ids, + context_types, + len(context_ids), + len(context_types), + ) + + try: + target_ids = target_ids + [self.eos_token_id] + except exception: + print(sample["target"], target_ids, self.eos_token_id) + + tokenized = {} + tokenized["input_ids"] = np.array(context_ids + knowledge_ids, dtype=np.int32) + 
tokenized["token_types"] = np.array( + context_types + knowledge_types, dtype=np.int32 + ) + tokenized["attention_mask"] = np.ones( + len(context_types + knowledge_types), dtype=np.int8 + ) + tokenized["labels"] = np.array(target_ids, dtype=np.int32) + + return tokenized + + def get_token_type(self, context, tokentypes=None): + # token_type fail in tokenizer, all zero + context_token_types = [] + for i, line in enumerate(context): + if tokentypes: + if i % 2 == 0: + token_type = [0] * len(tokentypes[i]) + else: + token_type = [1] * len(tokentypes[i]) + else: + if i % 2 == 0: + token_type = [0] * (1 + len(line)) + else: + token_type = [1] * (1 + len(line)) + + context_token_types.extend(token_type) + + return context_token_types + + +class DialogDataModel(pl.LightningDataModule): + @staticmethod + def add_data_specific_args(parent_args): + parser = parent_args.add_argument_group("SuperviseT5DataModel") + parser.add_argument("--dataset_num_workers", default=8, type=int) + parser.add_argument("--dataloader_num_workers", default=4, type=int) + parser.add_argument("--train_data_path", default="dialog_4g_test", type=str) + parser.add_argument( + "--valid_data_path", default="wudao_180g_mt5_tokenized", type=str + ) + parser.add_argument("--train_batchsize", default=2, type=int) + parser.add_argument("--valid_batchsize", default=2, type=int) + parser.add_argument("--max_seq_length", default=512, type=int) + parser.add_argument("--max_knowledge_length", default=128, type=int) + parser.add_argument("--max_target_length", default=128, type=int) + + return parent_args + + def __init__(self, args): + super().__init__() + self.save_hyperparameters(args) + self.load_data(args) + self.epochs = args.max_epochs + + def load_data(self, args): + if args.train_split_size is not None: + from fengshen.data.fs_datasets import load_dataset + + data_splits = load_dataset( + args.train_data_path, num_proc=args.dataset_num_workers + ) + train_split = data_splits['train'] + test_split = data_splits['test'] + print('train:', train_split, '\ntest_data:', test_split) + self.train_dataset = DialogDataset( + args.train_data_path, args, load_data_type=1, data="train" + ) + self.test_dataset = DialogDataset( + args.train_data_path, args, load_data_type=1, data="test" + ) + else: + self.train_data = DialogDataset( + args.train_data_path, args, load_data_type=1 + ) + + self.config = MT5Config.from_pretrained(args.pretrained_model_path) + self.pad_token_id = self.config.pad_token_id + self.decoder_start_token_id = self.config.decoder_start_token_id + print("bos id:", self.decoder_start_token_id) + + def collate_fn(self, samples): + batch = { + k: [ + torch.tensor(samples[i][k], dtype=torch.int64) + for i in range(len(samples)) + ] + for k in ["input_ids", "token_types", "attention_mask", "labels"] + } + + # print(batch) + for k, v in batch.items(): + if k != "labels": + batch[k] = pad_sequence( + v, batch_first=True, padding_value=self.pad_token_id + ) + else: + batch[k] = pad_sequence(v, batch_first=True, padding_value=-100) + batch["decoder_input_ids"] = torch.tensor( + self.shift_tokens_right( + batch["labels"], self.pad_token_id, self.decoder_start_token_id + ), + dtype=torch.long, + ) + return batch + + def shift_tokens_right( + self, input_ids: np.array, pad_token_id: int, decoder_start_token_id: int + ) -> np.ndarray: + """ + Shift input ids one token to the right. 
+ """ + shifted_input_ids = np.zeros_like(input_ids) + shifted_input_ids[:, 1:] = input_ids[:, :-1] + shifted_input_ids[:, 0] = decoder_start_token_id + + shifted_input_ids = np.where( + shifted_input_ids == -100, pad_token_id, shifted_input_ids + ) + return shifted_input_ids + + def train_dataloader(self): + from fengshen.data.universal_datamodule.universal_sampler import ( + PretrainingRandomSampler, + ) + from fengshen.data.universal_datamodule.universal_datamodule import ( + get_consume_samples, + ) + + # 采用自定义的sampler,确保继续训练能正确取到数据 + consumed_samples = get_consume_samples(self) + batch_sampler = PretrainingRandomSampler( + epoch=self.epochs, + total_samples=len(self.train_dataset), + consumed_samples=consumed_samples, + micro_batch_size=self.hparams.train_batchsize, + data_parallel_rank=self.trainer.global_rank, # gpu idx + data_parallel_size=self.trainer.world_size, # gpu num + ) + return DataLoader( + self.train_dataset, + batch_sampler=batch_sampler, + pin_memory=True, + num_workers=self.hparams.dataloader_num_workers, + collate_fn=self.collate_fn, + ) + + def val_dataloader(self): + sampler = torch.utils.data.distributed.DistributedSampler( + self.test_dataset, shuffle=False + ) + return DataLoader( + self.test_dataset, + sampler=sampler, + shuffle=False, + batch_size=self.hparams.valid_batchsize, + pin_memory=True, + num_workers=self.hparams.dataloader_num_workers, + collate_fn=self.collate_fn, + ) + + def predict_dataloader(self): + sampler = torch.utils.data.distributed.DistributedSampler( + self.test_dataset, shuffle=False + ) + return DataLoader( + self.test_dataset, + sampler=sampler, + shuffle=False, + batch_size=self.hparams.valid_batchsize, + pin_memory=True, + num_workers=self.hparams.dataloader_num_workers, + collate_fn=self.collate_fn, + ) + + +if __name__ == "__main__": + # test + import argparse + + total_parser = argparse.ArgumentParser("DATASET parser") + total_parser.add_argument( + "--tokenizer_type", + default="t5_tokenizer", + choices=["bert_tokenizer", "t5_tokenizer"], + ) + total_parser.add_argument("--preprocessing_num_workers", default="10", type=int) + total_parser.add_argument( + "--new_vocab_path", + default="/cognitive_comp/hejunqing/projects/Dialog_pretrain/randeng_t5_newvocab_784M", + type=str, + ) + total_parser.add_argument("--train_split_size", default=0.995, type=int) + total_parser.add_argument( + "--pretrained_model_path", + default="/cognitive_comp/hejunqing/projects/Dialog_pretrain/randeng_t5_newvocab_784M", + ) + total_parser = DialogDataModel.add_data_specific_args(total_parser) + args = total_parser.parse_args() + dl = DialogDataModel(args) + + for i in range(5): + for batch in dl.train_dataloader(): + print(batch) + print(batch["input_ids"]) + print(batch["token_types"]) + print(batch["decoder_input_ids"]) + print(batch["labels"]) + + print("test finish") diff --git a/fengshen/data/taiyi_stable_diffusion_datasets/taiyi_datasets.py b/fengshen/data/taiyi_stable_diffusion_datasets/taiyi_datasets.py new file mode 100644 index 0000000000000000000000000000000000000000..73e1071ac27c9839030734fe664abbcfef08d96b --- /dev/null +++ b/fengshen/data/taiyi_stable_diffusion_datasets/taiyi_datasets.py @@ -0,0 +1,173 @@ +from torch.utils.data import Dataset, ConcatDataset +import os +from concurrent.futures import ProcessPoolExecutor +import pandas as pd + + +def add_data_args(parent_args): + parser = parent_args.add_argument_group('taiyi stable diffusion data args') + # 支持传入多个路径,分别加载 + parser.add_argument( + "--datasets_path", type=str, default=None, 
required=True, nargs='+', + help="A folder containing the training data of instance images.", + ) + parser.add_argument( + "--datasets_type", type=str, default=None, required=True, choices=['txt', 'csv', 'fs_datasets'], nargs='+', + help="dataset type, txt or csv, same len as datasets_path", + ) + parser.add_argument( + "--resolution", type=int, default=512, + help=( + "The resolution for input images, all the images in the train/validation dataset will be resized to this" + " resolution" + ), + ) + parser.add_argument( + "--center_crop", action="store_true", default=False, + help="Whether to center crop images before resizing to resolution" + ) + parser.add_argument("--thres", type=float, default=0.2) + return parent_args + + +class TXTDataset(Dataset): + # 添加Txt数据集读取,主要是针对Zero23m数据集。 + def __init__(self, + foloder_name, + thres=0.2): + super().__init__() + # print(f'Loading folder data from {foloder_name}.') + self.image_paths = [] + ''' + 暂时没有开源这部分文件 + score_data = pd.read_csv(os.path.join(foloder_name, 'score.csv')) + img_path2score = {score_data['image_path'][i]: score_data['score'][i] + for i in range(len(score_data))} + ''' + # print(img_path2score) + # 这里都存的是地址,避免初始化时间过多。 + for each_file in os.listdir(foloder_name): + if each_file.endswith('.jpg'): + self.image_paths.append(os.path.join(foloder_name, each_file)) + + # print('Done loading data. Len of images:', len(self.image_paths)) + + def __len__(self): + return len(self.image_paths) + + def __getitem__(self, idx): + img_path = str(self.image_paths[idx]) + caption_path = img_path.replace('.jpg', '.txt') # 图片名称和文本名称一致。 + with open(caption_path, 'r') as f: + caption = f.read() + return {'img_path': img_path, 'caption': caption} + + +# NOTE 加速读取数据,直接用原版的,在外部使用并行读取策略。30min->3min +class CSVDataset(Dataset): + def __init__(self, + input_filename, + image_root, + img_key, + caption_key, + thres=0.2): + super().__init__() + # logging.debug(f'Loading csv data from {input_filename}.') + print(f'Loading csv data from {input_filename}.') + self.images = [] + self.captions = [] + + if input_filename.endswith('.csv'): + # print(f"Load Data from{input_filename}") + df = pd.read_csv(input_filename, index_col=0, on_bad_lines='skip') + print(f'file {input_filename} datalen {len(df)}') + # 这个图片的路径也需要根据数据集的结构稍微做点修改 + self.images.extend(df[img_key].tolist()) + self.captions.extend(df[caption_key].tolist()) + self.image_root = image_root + + def __len__(self): + return len(self.images) + + def __getitem__(self, idx): + img_path = os.path.join(self.image_root, str(self.images[idx])) + return {'img_path': img_path, 'caption': self.captions[idx]} + + +def if_final_dir(path: str) -> bool: + # 如果当前目录有一个文件,那就算是终极目录 + for f in os.scandir(path): + if f.is_file(): + return True + return False + + +def process_pool_read_txt_dataset(args, + input_root=None, + thres=0.2): + p = ProcessPoolExecutor(max_workers=20) + all_datasets = [] + res = [] + + # 遍历该目录下所有的子目录 + def traversal_files(path: str): + list_subfolders_with_paths = [f.path for f in os.scandir(path) if f.is_dir()] + for dir_path in list_subfolders_with_paths: + if if_final_dir(dir_path): + res.append(p.submit(TXTDataset, + dir_path, + thres)) + else: + traversal_files(dir_path) + traversal_files(input_root) + p.shutdown() + for future in res: + all_datasets.append(future.result()) + dataset = ConcatDataset(all_datasets) + return dataset + + +def process_pool_read_csv_dataset(args, + input_root, + thres=0.20): + # here input_filename is a directory containing a CSV file + all_csvs = 
os.listdir(os.path.join(input_root, 'release')) + image_root = os.path.join(input_root, 'images') + # csv_with_score = [each for each in all_csvs if 'score' in each] + all_datasets = [] + res = [] + p = ProcessPoolExecutor(max_workers=150) + for path in all_csvs: + each_csv_path = os.path.join(input_root, 'release', path) + res.append(p.submit(CSVDataset, + each_csv_path, + image_root, + img_key="name", + caption_key="caption", + thres=thres)) + p.shutdown() + for future in res: + all_datasets.append(future.result()) + dataset = ConcatDataset(all_datasets) + return dataset + + +def load_data(args, global_rank=0): + assert len(args.datasets_path) == len(args.datasets_type), \ + "datasets_path num not equal to datasets_type" + all_datasets = [] + for path, type in zip(args.datasets_path, args.datasets_type): + if type == 'txt': + all_datasets.append(process_pool_read_txt_dataset( + args, input_root=path, thres=args.thres)) + elif type == 'csv': + all_datasets.append(process_pool_read_csv_dataset( + args, input_root=path, thres=args.thres)) + elif type == 'fs_datasets': + from fengshen.data.fs_datasets import load_dataset + all_datasets.append(load_dataset(path, num_proc=args.num_workers, + thres=args.thres, global_rank=global_rank)['train']) + else: + raise ValueError('unsupport dataset type: %s' % type) + print(f'load datasset {type} {path} len {len(all_datasets[-1])}') + return {'train': ConcatDataset(all_datasets)} diff --git a/fengshen/data/task_dataloader/__init__.py b/fengshen/data/task_dataloader/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..25810ab9ab20ad36f72ba20b31768341e78e2676 --- /dev/null +++ b/fengshen/data/task_dataloader/__init__.py @@ -0,0 +1,3 @@ +# coding=utf-8 +from .task_datasets import LCSTSDataModel, LCSTSDataset +__all__ = ['LCSTSDataModel', 'LCSTSDataset'] diff --git a/fengshen/data/task_dataloader/medicalQADataset.py b/fengshen/data/task_dataloader/medicalQADataset.py new file mode 100644 index 0000000000000000000000000000000000000000..3d76ed583c7d150769c81d830293909e1c110485 --- /dev/null +++ b/fengshen/data/task_dataloader/medicalQADataset.py @@ -0,0 +1,137 @@ +# coding=utf8 +import os +import pytorch_lightning as pl +from torch.utils.data import DataLoader, Dataset +from tqdm import tqdm +from transformers import AutoTokenizer + + +class GPT2QADataset(Dataset): + ''' + Dataset Used for yuyuan medical qa task. + Just surpport small datasets, when deal with large datasets it may be slowly. 
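+    Each line of the data file is a Python dict literal with 'Question' and 'answer' keys (parsed in data_parse).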
+ for large datasets please use mmapdatasets(doing) + ''' + + def __init__(self, data_path, name, args): + super().__init__() + self.tokenizer = AutoTokenizer.from_pretrained( + args.pretrained_model_path) + if self.tokenizer.pad_token is None: + self.tokenizer.add_special_tokens({'pad_token': '<|endoftext|>'}) + self.data_size = os.path.getsize(data_path)/1024/1024/1024 + self.data_type_name = name + self.data = self.load_data(data_path) + self.max_seq_length = args.max_seq_length + + def __len__(self): + return len(self.data) + + def __getitem__(self, index): + return self.encode(self.data[index]) + + def load_data(self, data_path): + # 有进度条展示 + if self.data_size <= 5: + with open(data_path, "rt", encoding='utf8') as f: + lines = f.readlines() + total_num = len(lines) + data_gen = lines + else: + data_gen = open(data_path, "rt", encoding='utf8') + total_num = None + + data = [] + with tqdm(total=total_num, desc=f'{self.data_type_name}处理进度', mininterval=0.3) as bar: + for idx, line in enumerate(data_gen): + data.append(self.data_parse(line)) + bar.update() + + if self.data_size > 5: + data_gen.close() + return data + + def data_parse(self, line): + """ + 解析不同格式的数据 + """ + dic = eval(line.strip()) + return dic + + def encode(self, item): + """ + 将数据转换成模型训练的输入 + """ + inputs_dict = self.tokenizer.encode_plus(item['Question']+item['answer'], + max_length=self.max_seq_length, padding='max_length', + truncation=True, return_tensors='pt') + target = inputs_dict['input_ids'] + labels = target.clone().detach() + labels[target == self.tokenizer.pad_token_id] = -100 + return { + "input_ids": inputs_dict['input_ids'].squeeze(), + "attention_mask": inputs_dict['attention_mask'].squeeze(), + "labels": labels.squeeze(), + "question": item['Question'], + "answer": item['answer'] + } + + +class GPT2QADataModel(pl.LightningDataModule): + @staticmethod + def add_data_specific_args(parent_args): + parser = parent_args.add_argument_group('GPT2QADataModel') + parser.add_argument('--data_dir', type=str, required=True) + parser.add_argument('--num_workers', default=2, type=int) + parser.add_argument('--train_data', default='train.txt', type=str) + parser.add_argument('--valid_data', default='valid.txt', type=str) + parser.add_argument('--test_data', default='test.txt', type=str) + parser.add_argument('--train_batchsize', type=int, required=True) + parser.add_argument('--valid_batchsize', type=int, required=True) + parser.add_argument('--max_seq_length', default=1024, type=int) + return parent_args + + def __init__(self, args): + super().__init__() + self.args = args + self.train_batchsize = args.train_batchsize + self.valid_batchsize = args.valid_batchsize + if not args.do_eval_only: + self.train_data = GPT2QADataset(os.path.join( + args.data_dir, args.train_data), '训练集', args) + self.valid_data = GPT2QADataset(os.path.join( + args.data_dir, args.valid_data), '验证集', args) + self.test_data = GPT2QADataset(os.path.join( + args.data_dir, args.test_data), '测试集', args) + + def train_dataloader(self): + return DataLoader( + self.train_data, shuffle=True, + batch_size=self.train_batchsize, + pin_memory=False, num_workers=self.args.num_workers) + + def val_dataloader(self): + return DataLoader(self.valid_data, shuffle=False, + batch_size=self.valid_batchsize, + pin_memory=False, num_workers=self.args.num_workers) + + def predict_dataloader(self): + return DataLoader(self.test_data, shuffle=False, + batch_size=self.valid_batchsize, pin_memory=False, + num_workers=self.args.num_workers) + + +if __name__ == '__main__': + 
import argparse + modelfile = '/cognitive_comp/wuziwei/pretrained_model_hf/medical_v2' + datafile = '/cognitive_comp/wuziwei/task-data/medical_qa/medical_qa_train.txt' + parser = argparse.ArgumentParser(description='hf test', allow_abbrev=False) + group = parser.add_argument_group(title='test args') + group.add_argument('--pretrained-model-path', type=str, default=modelfile, + help='Number of transformer layers.') + group.add_argument('--max-seq-length', type=int, default=1024) + args = parser.parse_args() + + testml = GPT2QADataset(datafile, 'medical_qa', args=args) + + print(testml[10]) diff --git a/fengshen/data/task_dataloader/task_datasets.py b/fengshen/data/task_dataloader/task_datasets.py new file mode 100644 index 0000000000000000000000000000000000000000..a8fe7bcf732c61725853df92d9422f207d55f785 --- /dev/null +++ b/fengshen/data/task_dataloader/task_datasets.py @@ -0,0 +1,206 @@ +# coding=utf8 +from torch.utils.data import Dataset, DataLoader +from tqdm import tqdm +from transformers import AutoTokenizer +import json +import torch +import pytorch_lightning as pl +import os + + +class AbstractCollator: + """ + collector for summary task + """ + + def __init__(self, tokenizer, max_enc_length, max_dec_length, prompt): + self.tokenizer = tokenizer + self.max_enc_length = max_enc_length + self.max_dec_length = max_dec_length + self.prompt = prompt + + def __call__(self, samples): + + labels = [] + attn_mask = [] + # decoder_attn_mask = [] + source_inputs = [] + for sample in samples: + encode_dict = self.tokenizer.encode_plus( + self.prompt + sample['text'], + max_length=self.max_enc_length, + padding='max_length', + truncation=True, + return_tensors='pt') + decode_dict = self.tokenizer.encode_plus( + sample['summary'], + max_length=self.max_dec_length, + padding='max_length', + truncation=True, + return_tensors='pt') + source_inputs.append(encode_dict['input_ids'].squeeze()) + labels.append(decode_dict['input_ids'].squeeze()) + attn_mask.append(encode_dict['attention_mask'].squeeze()) + # decoder_attn_mask.append(decode_dict['attention_mask'].squeeze()) + # labels = torch.tensor(decode_dict['input']) + + source_inputs = torch.stack(source_inputs) + labels = torch.stack(labels) + attn_mask = torch.stack(attn_mask) + # decoder_attn_mask = torch.stack(decoder_attn_mask) + # decode_input_idxs = shift_tokens_right(labels, self.tokenizer.pad_token_id, self.tokenizer.pad_token_id) + end_token_index = torch.where(labels == self.tokenizer.eos_token_id)[1] + for idx, end_idx in enumerate(end_token_index): + labels[idx][end_idx + 1:] = -100 + + return { + "input_ids": source_inputs, + "attention_mask": attn_mask, + "labels": labels, + "text": [sample['text'] for sample in samples], + "summary": [sample['summary'] for sample in samples] + } + + +class LCSTSDataset(Dataset): + ''' + Dataset Used for LCSTS summary task. 
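+    Each line of the data file is a JSON object with "text" and "summary" fields.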
+ ''' + + def __init__(self, data_path, args): + super().__init__() + self.tokenizer = AutoTokenizer.from_pretrained( + args.pretrained_model_path, use_fast=False) + self.data = self.load_data(data_path) + self.prompt = args.prompt + self.max_enc_length = args.max_enc_length + self.max_dec_length = args.max_dec_length + + def __len__(self): + return len(self.data) + + def __getitem__(self, index): + return self.encode(self.data[index]) + + def load_data(self, data_path): + with open(data_path, "r", encoding='utf8') as f: + lines = f.readlines() + samples = [] + for line in tqdm(lines): + obj = json.loads(line) + source = obj['text'] + target = obj['summary'] + samples.append({ + "text": source, + "summary": target + }) + return samples + + def cal_data(self, data_path): + with open(data_path, "r", encoding='utf8') as f: + lines = f.readlines() + samples = [] + enc_sizes = [] + dec_sizes = [] + for line in tqdm(lines): + obj = json.loads(line.strip()) + source = obj['text'] + target = obj['summary'] + enc_input_ids = self.tokenizer.encode(source) + target = self.tokenizer.encode(target) + enc_sizes.append(len(enc_input_ids)) + dec_sizes.append(len(target)-1) + samples.append({ + "enc_input_ids": enc_input_ids, + "dec_input_ids": target[:-1], + "label_ids": target[1:] + }) + max_enc_len = max(enc_sizes) + max_dec_len = max(dec_sizes) + import numpy as np + # mean of len(enc_input_ids): 74.68041911345998 + # mean of len(dec_input_ids): 14.02265483791283 + # max of len(enc_input_ids): 132 + # max of len(dec_input_ids): 31 + print('mean of len(enc_input_ids):', np.mean(enc_sizes), + 'mean of len(dec_input_ids):', np.mean(dec_sizes), + 'max of len(enc_input_ids):', max_enc_len, + 'max of len(dec_input_ids):', max_dec_len) + return samples + + def encode(self, item): + encode_dict = self.tokenizer.encode_plus( + self.prompt + item['text'], + max_length=self.max_enc_length, + padding='max_length', + truncation=True, + return_tensors='pt') + decode_dict = self.tokenizer.encode_plus( + item['summary'], + max_length=self.max_dec_length, + padding='max_length', + truncation=True) + + target = decode_dict['input_ids'] + # print('encode_dict shape:', encode_dict['input_ids'].shape) + labels = torch.tensor(target) + labels[target == self.tokenizer.pad_token_id] = -100 + return { + "input_ids": encode_dict['input_ids'].squeeze(), + "attention_mask": encode_dict['attention_mask'].squeeze(), + "labels": labels.squeeze(), + "text": item['text'], + "summary": item['summary'] + } + + +class LCSTSDataModel(pl.LightningDataModule): + @staticmethod + def add_data_specific_args(parent_args): + parser = parent_args.add_argument_group('LCSTSDataModel') + parser.add_argument( + '--data_dir', default='/cognitive_comp/ganruyi/data_datasets_LCSTS_LCSTS/', type=str) + parser.add_argument('--num_workers', default=8, type=int) + parser.add_argument('--train_data', default='train.jsonl', type=str) + parser.add_argument('--valid_data', default='valid.jsonl', type=str) + parser.add_argument('--test_data', default='test_public.jsonl', type=str) + parser.add_argument('--train_batchsize', default=128, type=int) + parser.add_argument('--valid_batchsize', default=128, type=int) + parser.add_argument('--max_enc_length', default=128, type=int) + parser.add_argument('--max_dec_length', default=30, type=int) + parser.add_argument('--prompt', default='summarize:', type=str) + return parent_args + + def __init__(self, args): + super().__init__() + self.args = args + self.train_batchsize = args.train_batchsize + self.valid_batchsize = 
args.valid_batchsize + if not args.do_eval_only: + self.train_data = LCSTSDataset(os.path.join( + args.data_dir, args.train_data), args) + self.valid_data = LCSTSDataset(os.path.join( + args.data_dir, args.valid_data), args) + self.test_data = LCSTSDataset(os.path.join( + args.data_dir, args.test_data), args) + + def train_dataloader(self): + return DataLoader(self.train_data, + shuffle=True, + batch_size=self.train_batchsize, + pin_memory=False, + num_workers=self.args.num_workers) + + def val_dataloader(self): + return DataLoader(self.valid_data, + shuffle=False, + batch_size=self.valid_batchsize, + pin_memory=False, + num_workers=self.args.num_workers) + + def predict_dataloader(self): + return DataLoader(self.test_data, + shuffle=False, + batch_size=self.valid_batchsize, + pin_memory=False, + num_workers=self.args.num_workers) diff --git a/fengshen/data/universal_datamodule/__init__.py b/fengshen/data/universal_datamodule/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..68169d26a8424ae877b5c7efc2b7be2e761cd3cb --- /dev/null +++ b/fengshen/data/universal_datamodule/__init__.py @@ -0,0 +1,4 @@ +from .universal_datamodule import UniversalDataModule +from .universal_sampler import PretrainingSampler, PretrainingRandomSampler + +__all__ = ['UniversalDataModule', 'PretrainingSampler', 'PretrainingRandomSampler'] diff --git a/fengshen/data/universal_datamodule/universal_datamodule.py b/fengshen/data/universal_datamodule/universal_datamodule.py new file mode 100644 index 0000000000000000000000000000000000000000..240557694e97197f08a310351eb6206973107c4d --- /dev/null +++ b/fengshen/data/universal_datamodule/universal_datamodule.py @@ -0,0 +1,165 @@ +from pytorch_lightning import LightningDataModule +from typing import Optional + +from torch.utils.data import DataLoader, DistributedSampler + + +def get_consume_samples(data_model: LightningDataModule) -> int: + if hasattr(data_model.trainer.lightning_module, 'consumed_samples'): + consumed_samples = data_model.trainer.lightning_module.consumed_samples + print('get consumed samples from model: {}'.format(consumed_samples)) + else: + world_size = data_model.trainer.world_size + consumed_samples = max(0, data_model.trainer.global_step - 1) * \ + data_model.hparams.train_batchsize * world_size * data_model.trainer.accumulate_grad_batches + print('calculate consumed samples: {}'.format(consumed_samples)) + return consumed_samples + + +class UniversalDataModule(LightningDataModule): + @ staticmethod + def add_data_specific_args(parent_args): + parser = parent_args.add_argument_group('Universal DataModule') + parser.add_argument('--num_workers', default=8, type=int) + parser.add_argument('--dataloader_workers', default=2, type=int) + parser.add_argument('--train_batchsize', default=16, type=int) + parser.add_argument('--val_batchsize', default=16, type=int) + parser.add_argument('--test_batchsize', default=16, type=int) + parser.add_argument('--datasets_name', type=str, default=None) + parser.add_argument('--train_datasets_field', type=str, default='train') + parser.add_argument('--val_datasets_field', type=str, default='validation') + parser.add_argument('--test_datasets_field', type=str, default='test') + parser.add_argument('--train_file', type=str, default=None) + parser.add_argument('--val_file', type=str, default=None) + parser.add_argument('--test_file', type=str, default=None) + parser.add_argument('--raw_file_type', type=str, default='json') + parser.add_argument('--sampler_type', type=str, + choices=['single', + 
'random'], + default='random') + return parent_args + + def __init__( + self, + tokenizer, + collate_fn, + args, + datasets=None, + **kwargs, + ): + super().__init__() + # 如果不传入datasets的名字,则可以在对象外部替换内部的datasets为模型需要的 + if datasets is not None: + self.datasets = datasets + elif args.datasets_name is not None: + from fengshen.data.fs_datasets import load_dataset + print('---------begin to load datasets {}'.format(args.datasets_name)) + self.datasets = load_dataset( + args.datasets_name, num_proc=args.num_workers) + print('---------ending load datasets {}'.format(args.datasets_name)) + else: + print('---------begin to load datasets from local file') + from datasets import load_dataset + self.datasets = load_dataset(args.raw_file_type, + data_files={ + args.train_datasets_field: args.train_file, + args.val_datasets_field: args.val_file, + args.test_datasets_field: args.test_file}) + print('---------end to load datasets from local file') + + self.tokenizer = tokenizer + self.collate_fn = collate_fn + self.save_hyperparameters(args) + + def get_custom_sampler(self, ds): + from .universal_sampler import PretrainingRandomSampler + from .universal_sampler import PretrainingSampler + world_size = self.trainer.world_size + consumed_samples = get_consume_samples(self) + # use the user default sampler + if self.hparams.sampler_type == 'random': + return PretrainingRandomSampler( + total_samples=len(ds), + # consumed_samples cal by global steps + consumed_samples=consumed_samples, + micro_batch_size=self.hparams.train_batchsize, + data_parallel_rank=self.trainer.global_rank, + data_parallel_size=world_size, + epoch=self.trainer.current_epoch, + ) + elif self.hparams.sampler_type == 'single': + return PretrainingSampler( + total_samples=len(ds), + # consumed_samples cal by global steps + consumed_samples=consumed_samples, + micro_batch_size=self.hparams.train_batchsize, + data_parallel_rank=self.trainer.global_rank, + data_parallel_size=world_size, + ) + else: + raise Exception('Unknown sampler type: {}'.format(self.hparams.sampler_type)) + + def setup(self, stage: Optional[str] = None) -> None: + return + + def train_dataloader(self): + ds = self.datasets[self.hparams.train_datasets_field] + + collate_fn = self.collate_fn + if hasattr(ds, 'collate_fn'): + collate_fn = ds.collate_fn + + if self.hparams.replace_sampler_ddp is False: + return DataLoader( + ds, + batch_sampler=self.get_custom_sampler(ds), + num_workers=self.hparams.dataloader_workers, + collate_fn=collate_fn, + pin_memory=True, + ) + return DataLoader( + ds, + batch_size=self.hparams.train_batchsize, + num_workers=self.hparams.dataloader_workers, + collate_fn=collate_fn, + pin_memory=True, + ) + + def val_dataloader(self): + ds = self.datasets[self.hparams.val_datasets_field] + collate_fn = self.collate_fn + if hasattr(ds, 'collate_fn'): + collate_fn = ds.collate_fn + + return DataLoader( + ds, + batch_size=self.hparams.val_batchsize, + shuffle=False, + num_workers=self.hparams.dataloader_workers, + collate_fn=collate_fn, + sampler=DistributedSampler( + ds, shuffle=False), + pin_memory=True, + ) + + # return DataLoader( + # ds, shuffle=False, batch_size=self.hparams.val_batchsize, pin_memory=False, collate_fn=collate_fn, + # ) + + def test_dataloader(self): + ds = self.datasets[self.hparams.test_datasets_field] + + collate_fn = self.collate_fn + if collate_fn is None and hasattr(ds, 'collater'): + collate_fn = ds.collater + + return DataLoader( + ds, + batch_size=self.hparams.test_batchsize, + shuffle=False, + 
num_workers=self.hparams.dataloader_workers, + collate_fn=collate_fn, + sampler=DistributedSampler( + ds, shuffle=False), + pin_memory=True, + ) diff --git a/fengshen/data/universal_datamodule/universal_sampler.py b/fengshen/data/universal_datamodule/universal_sampler.py new file mode 100644 index 0000000000000000000000000000000000000000..86db3016d0f9795f5c8e501da2ff55c6e34e7222 --- /dev/null +++ b/fengshen/data/universal_datamodule/universal_sampler.py @@ -0,0 +1,125 @@ +# coding=utf-8 +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Dataloaders.""" + + +import torch + + +class PretrainingSampler: + + def __init__(self, total_samples, consumed_samples, micro_batch_size, + data_parallel_rank, data_parallel_size, drop_last=True): + # Keep a copy of input params for later use. + self.total_samples = total_samples + self.consumed_samples = consumed_samples + self.micro_batch_size = micro_batch_size + self.data_parallel_rank = data_parallel_rank + self.micro_batch_times_data_parallel_size = \ + self.micro_batch_size * data_parallel_size + self.drop_last = drop_last + + # Sanity checks. + assert self.total_samples > 0, \ + 'no sample to consume: {}'.format(self.total_samples) + assert self.consumed_samples < self.total_samples, \ + 'no samples left to consume: {}, {}'.format(self.consumed_samples, + self.total_samples) + assert self.micro_batch_size > 0 + assert data_parallel_size > 0 + assert self.data_parallel_rank < data_parallel_size, \ + 'data_parallel_rank should be smaller than data size: {}, ' \ + '{}'.format(self.data_parallel_rank, data_parallel_size) + + def __len__(self): + return self.total_samples // self.micro_batch_times_data_parallel_size + + def get_start_end_idx(self): + start_idx = self.data_parallel_rank * self.micro_batch_size + end_idx = start_idx + self.micro_batch_size + return start_idx, end_idx + + def __iter__(self): + batch = [] + # Last batch will be dropped if drop_last is not set False + for idx in range(self.consumed_samples, self.total_samples): + batch.append(idx) + if len(batch) == self.micro_batch_times_data_parallel_size: + start_idx, end_idx = self.get_start_end_idx() + yield batch[start_idx:end_idx] + batch = [] + + # Check the last partial batch and see drop_last is set + if len(batch) > 0 and not self.drop_last: + start_idx, end_idx = self.get_start_end_idx() + yield batch[start_idx:end_idx] + + +class PretrainingRandomSampler: + + def __init__(self, total_samples, consumed_samples, micro_batch_size, + data_parallel_rank, data_parallel_size, epoch): + # Keep a copy of input params for later use. 
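+        # `epoch` seeds the permutation in __iter__, so shuffling is deterministic across data-parallel ranks and can be advanced with set_epoch().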
+ self.total_samples = total_samples + self.consumed_samples = consumed_samples + self.micro_batch_size = micro_batch_size + self.data_parallel_rank = data_parallel_rank + self.data_parallel_size = data_parallel_size + self.micro_batch_times_data_parallel_size = \ + self.micro_batch_size * data_parallel_size + self.last_batch_size = \ + self.total_samples % self.micro_batch_times_data_parallel_size + self.epoch = epoch + + # Sanity checks. + assert self.total_samples > 0, \ + 'no sample to consume: {}'.format(self.total_samples) + assert self.micro_batch_size > 0 + assert data_parallel_size > 0 + assert self.data_parallel_rank < data_parallel_size, \ + 'data_parallel_rank should be smaller than data size: {}, ' \ + '{}'.format(self.data_parallel_rank, data_parallel_size) + + def __len__(self): + return self.total_samples // self.micro_batch_times_data_parallel_size + + def __iter__(self): + active_total_samples = self.total_samples - self.last_batch_size + current_epoch_samples = self.consumed_samples % active_total_samples + assert current_epoch_samples % self.micro_batch_times_data_parallel_size == 0 + + # data sharding and random sampling + bucket_size = (self.total_samples // self.micro_batch_times_data_parallel_size) \ + * self.micro_batch_size + bucket_offset = current_epoch_samples // self.data_parallel_size + start_idx = self.data_parallel_rank * bucket_size + + g = torch.Generator() + g.manual_seed(self.epoch) + random_idx = torch.randperm(bucket_size, generator=g).tolist() + idx_range = [start_idx + x for x in random_idx[bucket_offset:]] + + batch = [] + # Last batch if not complete will be dropped. + for idx in idx_range: + batch.append(idx) + if len(batch) == self.micro_batch_size: + self.consumed_samples += self.micro_batch_times_data_parallel_size + yield batch + batch = [] + + def set_epoch(self, epoch): + self.epoch = epoch diff --git a/fengshen/examples/DAVAE/generate.py b/fengshen/examples/DAVAE/generate.py new file mode 100644 index 0000000000000000000000000000000000000000..5d5aebfeb8d68d77bc6c0045ea3c36d789de17ec --- /dev/null +++ b/fengshen/examples/DAVAE/generate.py @@ -0,0 +1,36 @@ +# -*- encoding: utf-8 -*- +''' +Copyright 2022 The International Digital Economy Academy (IDEA). CCNL team. All rights reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+@File : generate.py +@Time : 2022/11/04 19:17 +@Author : Liang Yuxin +@Version : 1.0 +@Contact : liangyuxin@idea.edu.cn +@License : (C)Copyright 2022-2023, CCNL-IDEA +''' +# here put the import lib + +import torch +from fengshen.models.DAVAE.DAVAEModel import DAVAEModel +from transformers import BertTokenizer,T5Tokenizer +device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + +encoder_tokenizer = BertTokenizer.from_pretrained("IDEA-CCNL/Randeng-DAVAE-1.2B-General-Chinese") +decoder_tokenizer = T5Tokenizer.from_pretrained("IDEA-CCNL/Randeng-DAVAE-1.2B-General-Chinese", eos_token = '<|endoftext|>', pad_token = '',extra_ids=0) +decoder_tokenizer.add_special_tokens({'bos_token':''}) +vae_model = DAVAEModel.from_pretrained("IDEA-CCNL/Randeng-DAVAE-1.2B-General-Chinese").to(device) +input_texts = [ + "针对电力系统中的混沌振荡对整个互联电网的危害问题,提出了一种基于非线性光滑函数的滑模控制方法.", + "超市面积不算大.挺方便附近的居民购买的. 生活用品也比较齐全.价格适用中.", +] +output_texts = vae_model.simulate_batch(encoder_tokenizer,decoder_tokenizer,input_texts) +print(output_texts) diff --git a/fengshen/examples/FastDemo/README.md b/fengshen/examples/FastDemo/README.md new file mode 100644 index 0000000000000000000000000000000000000000..132519b95da3fd35f4c4fb6aae5d8c44faad3a42 --- /dev/null +++ b/fengshen/examples/FastDemo/README.md @@ -0,0 +1,105 @@ +# 「streamlit」快速搭建你的算法demo +在搭建demo之前,首先得做好这些准备工作: +- 模型训练完毕 +- 模型的入参确定 +- 安装streamlit库,`pip install streamlit` 就可以安装。 + +streamlit脚本的启动方式是 `streamlit run demo.py`,很简单就启动了一个demo页面,页面会随着脚本代码的改变实时刷新的。所以在没有经验的时候,可以创建一个demo.py的文件,照着下面的教程一步一步添加代码,看页面的展示情况。下面开始上干货,具体细节在代码注释中有说明! + +### 第一步 导包 +```python +import streamlit as st +# 其他包更具你的需要导入 +``` +[streamlit](https://streamlit.io)是一个用于构建机器学习、深度学习、数据可视化demo的python框架。它不需要你有web开发的经验,会写python就可以高效的开发你的demo。 + +### 第二步 页面导航信息以及布局配置 + +```python +st.set_page_config( + page_title="余元医疗问答", # 页面标签标题 + page_icon=":shark:", # 页面标签图标 + layout="wide", # 页面的布局 + initial_sidebar_state="expanded", # 左侧的sidebar的布局方式 + # 配置菜单按钮的信息 + menu_items={ + 'Get Help': 'https://www.extremelycoolapp.com/help', + 'Report a bug': "https://www.extremelycoolapp.com/bug", + 'About': "# This is a header. This is an *extremely* cool app!" 
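+        # These menu links are just placeholders; replace them with your own help/bug-report/about pages.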
+ } + ) +``` +这一步可以省略,如果想让app更加个性化,可以添加这些设置。 + +### 第三步 设置demo标题 +```python +st.title('Demo for MedicalQA') +``` +streamlit的每一个小组件对应于页面都有一个默认的样式展示。 + +### 第四步 配置demo的参数 + +```python +# 此处是用的sidebar,侧边栏作为参数配置模块 +st.sidebar.header("参数配置") +# 这里是在sidebar里面创建了表单,每个表单一定有一个标题和提交按钮 +sbform = st.sidebar.form("固定参数设置") +# slider是滑动条组建,可以配置数值型参数 +n_sample = sbform.slider("设置返回条数",min_value=1,max_value=10,value=3) +text_length = sbform.slider('生成长度:',min_value=32,max_value=512,value=64,step=32) +text_level = sbform.slider('文本多样性:',min_value=0.1,max_value=1.0,value=0.9,step=0.1) +# number_input也可以配置数值型参数 +model_id = sbform.number_input('选择模型号:',min_value=0,max_value=13,value=13,step=1) +# selectbox选择组建,只能选择配置的选项 +trans = sbform.selectbox('选择翻译内核',['百度通用','医疗生物']) +# 提交表单的配置,这些参数的赋值才生效 +sbform.form_submit_button("提交配置") + +# 这里是页面中的参数配置,也是demo的主体之一 +form = st.form("参数设置") +# 本demo是qa demo,所以要录入用户的文本输入,text_input组建可以实现 +input_text = form.text_input('请输入你的问题:',value='',placeholder='例如:糖尿病的症状有哪些?') +form.form_submit_button("提交") +``` +以上就把demo的参数基本配置完成了。 + +### 第五步 模型预测 +```python +# 定义一个前向预测的方法 +# @st.cache(suppress_st_warning=True) +def generate_qa(input_text,n_sample,model_id='7',length=64,translator='baidu',level=0.7): + # 这里我们是把模型用fastapi搭建了一个api服务 + URL = 'http://192.168.190.63:6605/qa' + data = { + "text":input_text,"n_sample":n_sample, + "model_id":model_id,"length":length, + 'translator':translator,'level':level + } + r = requests.get(URL,params=data) + return r.text +# 模型预测结果 +results = generate_qa(input_text,n_sample,model_id=str(model_id), + translator=translator,length=text_length,level=text_level) +``` +这里说明一下,由于demo展示机器没有GPU,所以模型部署采用的是Fastapi部署在后台的。如果demo展示的机器可以直接部署模型,这里可以直接把模型预测的方法写在这里,不需要另外部署模型,再用api的方式调用。这样做有一个值得注意的地方,因为streamlit的代码每一次运行,都是从头到尾执行一遍,就导致模型可能会重复加载,所以这里需要用到st.cache组建,当内容没有更新的时候,会把这一步的结果缓存,而不会重新执行。保证了效率不会因此而下降。 + +### 第六步 结果展示 +```python +with st.spinner('老夫正在思考中🤔...'): + if input_text: + results = generate_qa(input_text,n_sample,model_id=str(model_id), + translator=translator,length=text_length,level=text_level) + for idx,item in enumerate(eval(results),start=1): + st.markdown(f""" + **候选回答「{idx}」:**\n + """) + st.info('中文:%s'%item['fy_next_sentence']) + st.info('英文:%s'%item['next_sentence']) +``` +streamlit对不同格式的内容展示,有丰富的组建,对于文本可以用`st.markdown`组建以及`st.text`和`st.write`展示。更多组建和功能可以参考官方文档:https://docs.streamlit.io + +至此,一个完整的demo展示就完成了。效果图如下: + +![](./image/demo.png) + +完整的代码可以参考:`Fengshenbang-LM/fengshen/examples/FastDemo/YuyuanQA.py` diff --git a/fengshen/examples/FastDemo/YuyuanQA.py b/fengshen/examples/FastDemo/YuyuanQA.py new file mode 100644 index 0000000000000000000000000000000000000000..fed2d19bc61e0735f3868e1a30a532bd19fbb4b0 --- /dev/null +++ b/fengshen/examples/FastDemo/YuyuanQA.py @@ -0,0 +1,71 @@ +import requests +import langid +import streamlit as st +from translate import baiduTranslatorMedical +from translate import baiduTranslator + +langid.set_languages(['en', 'zh']) +lang_dic = {'zh': 'en', 'en': 'zh'} + +st.set_page_config( + page_title="余元医疗问答", + page_icon=":shark:", + # layout="wide", + initial_sidebar_state="expanded", + menu_items={ + 'Get Help': 'https://www.extremelycoolapp.com/help', + 'Report a bug': "https://www.extremelycoolapp.com/bug", + 'About': "# This is a header. This is an *extremely* cool app!" 
+ } +) +st.title('Demo for MedicalQA') + + +st.sidebar.header("参数配置") +sbform = st.sidebar.form("固定参数设置") +n_sample = sbform.slider("设置返回条数", min_value=1, max_value=10, value=3) +text_length = sbform.slider('生成长度:', min_value=32, max_value=512, value=64, step=32) +text_level = sbform.slider('文本多样性:', min_value=0.1, max_value=1.0, value=0.9, step=0.1) +model_id = sbform.number_input('选择模型号:', min_value=0, max_value=13, value=13, step=1) +trans = sbform.selectbox('选择翻译内核', ['百度通用', '医疗生物']) +sbform.form_submit_button("配置") + + +form = st.form("参数设置") +input_text = form.text_input('请输入你的问题:', value='', placeholder='例如:糖尿病的症状有哪些?') +if trans == '百度通用': + translator = 'baidu_common' +else: + translator = 'baidu' +if input_text: + lang = langid.classify(input_text)[0] + if translator == 'baidu': + st.write('**你的问题是:**', baiduTranslatorMedical(input_text, src=lang, dest=lang_dic[lang]).text) + else: + st.write('**你的问题是:**', baiduTranslator(input_text, src=lang, dest=lang_dic[lang]).text) + +form.form_submit_button("提交") + +# @st.cache(suppress_st_warning=True) + + +def generate_qa(input_text, n_sample, model_id='7', length=64, translator='baidu', level=0.7): + # st.write('调用了generate函数') + URL = 'http://192.168.190.63:6605/qa' + data = {"text": input_text, "n_sample": n_sample, "model_id": model_id, + "length": length, 'translator': translator, 'level': level} + r = requests.get(URL, params=data) + return r.text +# my_bar = st.progress(80) + + +with st.spinner('老夫正在思考中🤔...'): + if input_text: + results = generate_qa(input_text, n_sample, model_id=str(model_id), + translator=translator, length=text_length, level=text_level) + for idx, item in enumerate(eval(results), start=1): + st.markdown(f""" + **候选回答「{idx}」:**\n + """) + st.info('中文:%s' % item['fy_next_sentence']) + st.info('英文:%s' % item['next_sentence']) diff --git a/fengshen/examples/FastDemo/image/demo.png b/fengshen/examples/FastDemo/image/demo.png new file mode 100644 index 0000000000000000000000000000000000000000..3eee22e26192861429863058de716e457fc8fc57 Binary files /dev/null and b/fengshen/examples/FastDemo/image/demo.png differ diff --git a/fengshen/examples/GAVAE/generate.py b/fengshen/examples/GAVAE/generate.py new file mode 100644 index 0000000000000000000000000000000000000000..9e6a5693faaf18932d8a8648bb999546619c4cdf --- /dev/null +++ b/fengshen/examples/GAVAE/generate.py @@ -0,0 +1,23 @@ +import torch +from transformers import BertTokenizer,T5Tokenizer +from fengshen.models.GAVAE.GAVAEModel import GAVAEModel +device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + +encoder_tokenizer = BertTokenizer.from_pretrained("IDEA-CCNL/Randeng-GAVAE-1.2B-Augmentation-Chinese") +decoder_tokenizer = T5Tokenizer.from_pretrained("IDEA-CCNL/Randeng-GAVAE-1.2B-Augmentation-Chinese", eos_token = '<|endoftext|>', pad_token = '',extra_ids=0) +decoder_tokenizer.add_special_tokens({'bos_token':''}) +input_texts = [ + "非常好的一个博物馆,是我所有去过的博物馆里感觉最正规的一家,凭有效证件可以入馆,可以自助免费存小件物品,讲解员和馆内外的工作人员也非常认真,其他的服务人员也很热情,非常好的!馆内的藏品也让人非常震撼!希望继续保持~", + "这是我来长沙最最期待的一定要去的地方,总算今天特地去瞻仰千古遗容了,开车到门口大屏幕显示着门票已发完的字样,心里一惊以为今天是白来了。但进了停车场才知道凭停车卡和有效身份证里面也能领,停车还不花钱,真好。", + "地方很大 很气派~~可以逛很久~~~去的时候是免费的~不过要安检~~~里面的马王堆~幸追夫人~还是很不错的~~~~去的时候有一个吴越文化特别展~~~东西也很多~~~~~很好看", + "我们到达的时候是下午3点,门票已经发完了。当时正焦虑的不知道怎么办才好,门卫大哥给我们俩补办了门票,这才得以入馆。非常感谢!绝对不虚此行!相当震撼的展览!原来古人也化妆,还有假发。记忆最深的是那个藕汤。可惜真颜已不得见。", + "去过三次,个人认为这是长沙最值得去的地方,博物馆的重点就是辛追,遗憾的是,每次去我都会感到悲哀,虽然我三次去的时候都要门票,但是每次看到辛追,都觉得现代的人类不应该挖她出来,除了第一次我觉得辛追像刚死去一样,后来两次我觉得太惨不忍睹了。建议大家要去就早去,以后肯定越来越腐烂", + 
"上大学时候去的,当时学生证是半价25,后来凭有效证件就不要钱了。非常喜欢的一家博物馆,里面可看的东西很多,当然最吸引我的就是那个辛追夫人和“素纱单衣”,果然不是盖的~里面的讲解员大部分都是师大学历史类的,非常专业和有耐心。虽然不在长沙了,不过对那里还是很有感情的,赞~~~", + "这两年也有很多机会去博物馆。。。不过还是想说湖南省博物馆是非常有特色的。。。应该说整个展览分成两个部分吧。。。一个部分是马王堆的主体展。。。另一个就是湖南的一些考古发现。。。其实来省博大部分的游客还是冲着马王堆来的吧。。。博物馆也很有心的为每一批游客安排了讲解员。。。从马王堆的发现到马王堆出土文物的介绍再到最后棺木和辛追的介绍。。。真是上了一节很生动的历史课。", + "网上订票去的,还是很顺利的就进去了,里面挺清净的,外围的环境也不错,还有鸽子可以喂。那天不是很闹,兜了一圈感觉还是很顺畅的,老娘娘和金缕玉衣挺震撼的。到此一游还是挺需要的", +] +gavae_model = GAVAEModel.from_pretrained("IDEA-CCNL/Randeng-GAVAE-1.2B-Augmentation-Chinese").to(device) +gavae_model.train_gan(encoder_tokenizer,decoder_tokenizer,input_texts) +# n:输出样本数量 +texts = gavae_model.generate(n=5) +print(texts) diff --git a/fengshen/examples/PPVAE/generate.py b/fengshen/examples/PPVAE/generate.py new file mode 100644 index 0000000000000000000000000000000000000000..1bbd369768cf1b903b4edf642836d28dc5a09274 --- /dev/null +++ b/fengshen/examples/PPVAE/generate.py @@ -0,0 +1,24 @@ +import torch +from transformers import BertTokenizer,T5Tokenizer +from fengshen.models.PPVAE.pluginVAE import PPVAEModel +device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + +encoder_tokenizer = BertTokenizer.from_pretrained("IDEA-CCNL/Randeng-PPVAE-1.2B-Augmentation-Chinese") +decoder_tokenizer = T5Tokenizer.from_pretrained("IDEA-CCNL/Randeng-PPVAE-1.2B-Augmentation-Chinese", eos_token = '<|endoftext|>', pad_token = '',extra_ids=0) +decoder_tokenizer.add_special_tokens({'bos_token':''}) +ppvae_model = PPVAEModel.from_pretrained("IDEA-CCNL/Randeng-PPVAE-1.2B-Augmentation-Chinese").to(device) +input_texts = [ + "非常好的一个博物馆,是我所有去过的博物馆里感觉最正规的一家,凭有效证件可以入馆,可以自助免费存小件物品,讲解员和馆内外的工作人员也非常认真,其他的服务人员也很热情,非常好的!馆内的藏品也让人非常震撼!希望继续保持~", + "这是我来长沙最最期待的一定要去的地方,总算今天特地去瞻仰千古遗容了,开车到门口大屏幕显示着门票已发完的字样,心里一惊以为今天是白来了。但进了停车场才知道凭停车卡和有效身份证里面也能领,停车还不花钱,真好。", + "地方很大 很气派~~可以逛很久~~~去的时候是免费的~不过要安检~~~里面的马王堆~幸追夫人~还是很不错的~~~~去的时候有一个吴越文化特别展~~~东西也很多~~~~~很好看", + "我们到达的时候是下午3点,门票已经发完了。当时正焦虑的不知道怎么办才好,门卫大哥给我们俩补办了门票,这才得以入馆。非常感谢!绝对不虚此行!相当震撼的展览!原来古人也化妆,还有假发。记忆最深的是那个藕汤。可惜真颜已不得见。", + "去过三次,个人认为这是长沙最值得去的地方,博物馆的重点就是辛追,遗憾的是,每次去我都会感到悲哀,虽然我三次去的时候都要门票,但是每次看到辛追,都觉得现代的人类不应该挖她出来,除了第一次我觉得辛追像刚死去一样,后来两次我觉得太惨不忍睹了。建议大家要去就早去,以后肯定越来越腐烂", + "上大学时候去的,当时学生证是半价25,后来凭有效证件就不要钱了。非常喜欢的一家博物馆,里面可看的东西很多,当然最吸引我的就是那个辛追夫人和“素纱单衣”,果然不是盖的~里面的讲解员大部分都是师大学历史类的,非常专业和有耐心。虽然不在长沙了,不过对那里还是很有感情的,赞~~~", + "这两年也有很多机会去博物馆。。。不过还是想说湖南省博物馆是非常有特色的。。。应该说整个展览分成两个部分吧。。。一个部分是马王堆的主体展。。。另一个就是湖南的一些考古发现。。。其实来省博大部分的游客还是冲着马王堆来的吧。。。博物馆也很有心的为每一批游客安排了讲解员。。。从马王堆的发现到马王堆出土文物的介绍再到最后棺木和辛追的介绍。。。真是上了一节很生动的历史课。", + "网上订票去的,还是很顺利的就进去了,里面挺清净的,外围的环境也不错,还有鸽子可以喂。那天不是很闹,兜了一圈感觉还是很顺畅的,老娘娘和金缕玉衣挺震撼的。到此一游还是挺需要的", +] + +ppvae_model.train_plugin(encoder_tokenizer,decoder_tokenizer,input_texts,negative_samples=None) +# n:输出样本数量 +texts = ppvae_model.generate(n=5) +print(texts) \ No newline at end of file diff --git a/fengshen/examples/classification/demo_classification_afqmc_erlangshen_offload.sh b/fengshen/examples/classification/demo_classification_afqmc_erlangshen_offload.sh new file mode 100644 index 0000000000000000000000000000000000000000..f5ff555aa60e3cebd544b92a18443eb7505f8ae8 --- /dev/null +++ b/fengshen/examples/classification/demo_classification_afqmc_erlangshen_offload.sh @@ -0,0 +1,103 @@ +MODEL_NAME="IDEA-CCNL/Erlangshen-MegatronBert-1.3B" + +TEXTA_NAME=sentence1 +TEXTB_NAME=sentence2 +LABEL_NAME=label +ID_NAME=id + +BATCH_SIZE=1 +VAL_BATCH_SIZE=1 +ZERO_STAGE=3 +config_json="./ds_config.json" + +cat < $config_json +{ + "train_micro_batch_size_per_gpu": $BATCH_SIZE, + "steps_per_print": 1000, + "gradient_clipping": 1, + "zero_optimization": { + "stage": ${ZERO_STAGE}, + 
"offload_optimizer": { + "device": "cpu", + "pin_memory": true + }, + "offload_param": { + "device": "cpu", + "pin_memory": true + }, + "overlap_comm": true, + "contiguous_gradients": true, + "sub_group_size": 1e9, + "stage3_max_live_parameters": 1e9, + "stage3_max_reuse_distance": 1e9 + }, + "zero_allow_untested_optimizer": false, + "fp16": { + "enabled": true, + "loss_scale": 0, + "loss_scale_window": 1000, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "activation_checkpointing": { + "partition_activations": false, + "contiguous_memory_optimization": false + }, + "wall_clock_breakdown": false +} +EOT + +export PL_DEEPSPEED_CONFIG_PATH=$config_json + +DATA_ARGS="\ + --dataset_name IDEA-CCNL/AFQMC \ + --train_batchsize $BATCH_SIZE \ + --valid_batchsize $VAL_BATCH_SIZE \ + --max_length 128 \ + --texta_name $TEXTA_NAME \ + --textb_name $TEXTB_NAME \ + --label_name $LABEL_NAME \ + --id_name $ID_NAME \ + " + +MODEL_ARGS="\ + --learning_rate 1e-5 \ + --weight_decay 1e-1 \ + --warmup_ratio 0.01 \ + --num_labels 2 \ + --model_type huggingface-auto \ + " + +MODEL_CHECKPOINT_ARGS="\ + --monitor val_acc \ + --save_top_k 3 \ + --mode max \ + --every_n_train_steps 0 \ + --save_weights_only True \ + --dirpath . \ + --filename model-{epoch:02d}-{val_acc:.4f} \ + " + + +TRAINER_ARGS="\ + --max_epochs 67 \ + --gpus 1 \ + --num_nodes 1 \ + --strategy deepspeed_stage_${ZERO_STAGE}_offload \ + --gradient_clip_val 1.0 \ + --check_val_every_n_epoch 1 \ + --val_check_interval 1.0 \ + --precision 16 \ + --default_root_dir . \ + " + +options=" \ + --pretrained_model_path $MODEL_NAME \ + $DATA_ARGS \ + $MODEL_ARGS \ + $MODEL_CHECKPOINT_ARGS \ + $TRAINER_ARGS \ + " + +python3 finetune_classification.py $options + diff --git a/fengshen/examples/classification/demo_classification_afqmc_roberta.sh b/fengshen/examples/classification/demo_classification_afqmc_roberta.sh new file mode 100644 index 0000000000000000000000000000000000000000..bad55f2de72f66f02b583d9b191802c55cfe0a4b --- /dev/null +++ b/fengshen/examples/classification/demo_classification_afqmc_roberta.sh @@ -0,0 +1,62 @@ +MODEL_NAME="IDEA-CCNL/Erlangshen-Roberta-110M-NLI" + +TEXTA_NAME=sentence1 +TEXTB_NAME=sentence2 +LABEL_NAME=label +ID_NAME=id + +BATCH_SIZE=1 +VAL_BATCH_SIZE=1 + +DATA_ARGS="\ + --dataset_name IDEA-CCNL/AFQMC \ + --train_batchsize $BATCH_SIZE \ + --valid_batchsize $VAL_BATCH_SIZE \ + --max_length 128 \ + --texta_name $TEXTA_NAME \ + --textb_name $TEXTB_NAME \ + --label_name $LABEL_NAME \ + --id_name $ID_NAME \ + " + +MODEL_ARGS="\ + --learning_rate 1e-5 \ + --weight_decay 1e-2 \ + --warmup_ratio 0.01 \ + --num_labels 2 \ + --model_type huggingface-auto \ + " + +MODEL_CHECKPOINT_ARGS="\ + --monitor val_acc \ + --save_top_k 3 \ + --mode max \ + --every_n_train_steps 0 \ + --save_weights_only True \ + --dirpath . \ + --filename model-{epoch:02d}-{val_acc:.4f} \ + " + + +TRAINER_ARGS="\ + --max_epochs 67 \ + --gpus 1 \ + --num_nodes 1 \ + --strategy ddp \ + --gradient_clip_val 1.0 \ + --check_val_every_n_epoch 1 \ + --val_check_interval 1.0 \ + --precision 16 \ + --default_root_dir . 
\ + " + +options=" \ + --pretrained_model_path $MODEL_NAME \ + $DATA_ARGS \ + $MODEL_ARGS \ + $MODEL_CHECKPOINT_ARGS \ + $TRAINER_ARGS \ + " + +python3 finetune_classification.py $options + diff --git a/fengshen/examples/classification/demo_classification_afqmc_roberta_deepspeed.sh b/fengshen/examples/classification/demo_classification_afqmc_roberta_deepspeed.sh new file mode 100644 index 0000000000000000000000000000000000000000..48b003940a960454912a62731e5aec3b9046a6df --- /dev/null +++ b/fengshen/examples/classification/demo_classification_afqmc_roberta_deepspeed.sh @@ -0,0 +1,90 @@ +MODEL_NAME="IDEA-CCNL/Erlangshen-Roberta-110M-NLI" + +TEXTA_NAME=sentence1 +TEXTB_NAME=sentence2 +LABEL_NAME=label +ID_NAME=id + +BATCH_SIZE=32 +VAL_BATCH_SIZE=32 +ZERO_STAGE=1 +config_json="./ds_config.json" + +cat < $config_json +{ + "train_micro_batch_size_per_gpu": $BATCH_SIZE, + "steps_per_print": 1000, + "gradient_clipping": 0.1, + "zero_optimization": { + "stage": ${ZERO_STAGE} + }, + "zero_allow_untested_optimizer": false, + "fp16": { + "enabled": true, + "loss_scale": 0, + "loss_scale_window": 1000, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "activation_checkpointing": { + "partition_activations": false, + "contiguous_memory_optimization": false + }, + "wall_clock_breakdown": false +} +EOT + +export PL_DEEPSPEED_CONFIG_PATH=$config_json + +DATA_ARGS="\ + --dataset_name IDEA-CCNL/AFQMC \ + --train_batchsize $BATCH_SIZE \ + --valid_batchsize $VAL_BATCH_SIZE \ + --max_length 128 \ + --texta_name $TEXTA_NAME \ + --textb_name $TEXTB_NAME \ + --label_name $LABEL_NAME \ + --id_name $ID_NAME \ + " + +MODEL_ARGS="\ + --learning_rate 1e-5 \ + --weight_decay 1e-2 \ + --warmup_ratio 0.01 \ + --num_labels 2 \ + --model_type huggingface-auto \ + " + +MODEL_CHECKPOINT_ARGS="\ + --monitor val_acc \ + --save_top_k 3 \ + --mode max \ + --every_n_train_steps 0 \ + --save_weights_only True \ + --dirpath . \ + --filename model-{epoch:02d}-{val_acc:.4f} \ + " + + +TRAINER_ARGS="\ + --max_epochs 67 \ + --gpus 1 \ + --num_nodes 1 \ + --strategy deepspeed_stage_${ZERO_STAGE} \ + --gradient_clip_val 1.0 \ + --check_val_every_n_epoch 1 \ + --val_check_interval 1.0 \ + --precision 16 \ + --default_root_dir . \ + " + +options=" \ + --pretrained_model_path $MODEL_NAME \ + $DATA_ARGS \ + $MODEL_ARGS \ + $MODEL_CHECKPOINT_ARGS \ + $TRAINER_ARGS \ + " + +python3 finetune_classification.py $options + diff --git a/fengshen/examples/classification/finetune_classification.py b/fengshen/examples/classification/finetune_classification.py new file mode 100644 index 0000000000000000000000000000000000000000..2e643f2fcf560b6c817d22946ad4a6610b647e13 --- /dev/null +++ b/fengshen/examples/classification/finetune_classification.py @@ -0,0 +1,389 @@ +# coding=utf-8 +# Copyright 2021 The IDEA Authors. All rights reserved. + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at + +# http://www.apache.org/licenses/LICENSE-2.0 + +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
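For local data, the finetune_classification.py entrypoint added below reads one JSON object per line from --train_data/--valid_data/--test_data, with the text and label fields named by --texta_name/--textb_name/--label_name/--id_name (sentence1/sentence2/label/id in the AFQMC scripts above). A minimal sketch of producing such a file — the sentences and labels here are made-up placeholders, not real AFQMC rows:

```python
import json

# Illustrative only: field names mirror the --texta_name/--textb_name/--label_name/--id_name
# values used by the AFQMC demo scripts; the content is invented for demonstration.
samples = [
    {"sentence1": "example sentence A", "sentence2": "example sentence B", "label": "1", "id": 0},
    {"sentence1": "another sentence A", "sentence2": "another sentence B", "label": "0", "id": 1},
]
with open("train.json", "w", encoding="utf-8") as f:
    for sample in samples:
        f.write(json.dumps(sample, ensure_ascii=False) + "\n")
```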
+# from fengshen.models.zen1 import ZenModel +from dataclasses import dataclass +from fengshen.models.megatron_t5 import T5EncoderModel +from fengshen.models.roformer import RoFormerModel +from fengshen.models.longformer import LongformerModel +# from fengshen.models.cocolm.modeling_cocolm import COCOLMForSequenceClassification +import numpy as np +import os +from tqdm import tqdm +import json +import torch +import pytorch_lightning as pl +import argparse +from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping, LearningRateMonitor +from torch.utils.data import Dataset, DataLoader +from torch.utils.data._utils.collate import default_collate +from transformers import ( + BertModel, + BertConfig, + MegatronBertModel, + MegatronBertConfig, + AutoModel, + AutoConfig, + AutoTokenizer, + AutoModelForSequenceClassification, +) +# os.environ["CUDA_VISIBLE_DEVICES"] = '6' + + +model_dict = {'huggingface-bert': BertModel, + 'fengshen-roformer': RoFormerModel, + 'huggingface-megatron_bert': MegatronBertModel, + 'fengshen-megatron_t5': T5EncoderModel, + 'fengshen-longformer': LongformerModel, + # 'fengshen-zen1': ZenModel, + 'huggingface-auto': AutoModelForSequenceClassification, + } + + +class TaskDataset(Dataset): + def __init__(self, data_path, args, label2id): + super().__init__() + self.args = args + self.label2id = label2id + self.max_length = args.max_length + self.data = self.load_data(data_path, args) + + def __len__(self): + return len(self.data) + + def __getitem__(self, index): + return self.data[index] + + def load_data(self, data_path, args): + with open(data_path, 'r', encoding='utf8') as f: + lines = f.readlines() + samples = [] + for line in tqdm(lines): + data = json.loads(line) + text_id = int(data[args.id_name] + ) if args.id_name in data.keys() else 0 + texta = data[args.texta_name] if args.texta_name in data.keys( + ) else '' + textb = data[args.textb_name] if args.textb_name in data.keys( + ) else '' + labels = self.label2id[data[args.label_name] + ] if args.label_name in data.keys() else 0 + samples.append({args.texta_name: texta, args.textb_name: textb, + args.label_name: labels, 'id': text_id}) + return samples + + +@dataclass +class TaskCollator: + args = None + tokenizer = None + + def __call__(self, samples): + sample_list = [] + for item in samples: + if item[self.args.texta_name] != '' and item[self.args.textb_name] != '': + if self.args.model_type != 'fengshen-roformer': + encode_dict = self.tokenizer.encode_plus( + [item[self.args.texta_name], item[self.args.textb_name]], + max_length=self.args.max_length, + padding='max_length', + truncation='longest_first') + else: + encode_dict = self.tokenizer.encode_plus( + [item[self.args.texta_name] + + self.tokenizer.eos_token+item[self.args.textb_name]], + max_length=self.args.max_length, + padding='max_length', + truncation='longest_first') + else: + encode_dict = self.tokenizer.encode_plus( + item[self.args.texta_name], + max_length=self.args.max_length, + padding='max_length', + truncation='longest_first') + sample = {} + for k, v in encode_dict.items(): + sample[k] = torch.tensor(v) + sample['labels'] = torch.tensor(item[self.args.label_name]).long() + sample['id'] = item['id'] + sample_list.append(sample) + return default_collate(sample_list) + + +class TaskDataModel(pl.LightningDataModule): + @staticmethod + def add_data_specific_args(parent_args): + parser = parent_args.add_argument_group('TASK NAME DataModel') + parser.add_argument('--data_dir', default='./data', type=str) + 
parser.add_argument('--num_workers', default=8, type=int) + parser.add_argument('--train_data', default='train.json', type=str) + parser.add_argument('--valid_data', default='dev.json', type=str) + parser.add_argument('--test_data', default='test.json', type=str) + parser.add_argument('--train_batchsize', default=16, type=int) + parser.add_argument('--valid_batchsize', default=32, type=int) + parser.add_argument('--max_length', default=128, type=int) + + parser.add_argument('--texta_name', default='text', type=str) + parser.add_argument('--textb_name', default='sentence2', type=str) + parser.add_argument('--label_name', default='label', type=str) + parser.add_argument('--id_name', default='id', type=str) + + parser.add_argument('--dataset_name', default=None, type=str) + + return parent_args + + def __init__(self, args): + super().__init__() + self.train_batchsize = args.train_batchsize + self.valid_batchsize = args.valid_batchsize + self.tokenizer = AutoTokenizer.from_pretrained( + args.pretrained_model_path) + self.collator = TaskCollator() + self.collator.args = args + self.collator.tokenizer = self.tokenizer + if args.dataset_name is None: + self.label2id, self.id2label = self.load_schema(os.path.join( + args.data_dir, args.train_data), args) + self.train_data = TaskDataset(os.path.join( + args.data_dir, args.train_data), args, self.label2id) + self.valid_data = TaskDataset(os.path.join( + args.data_dir, args.valid_data), args, self.label2id) + self.test_data = TaskDataset(os.path.join( + args.data_dir, args.test_data), args, self.label2id) + else: + import datasets + ds = datasets.load_dataset(args.dataset_name) + self.train_data = ds['train'] + self.valid_data = ds['validation'] + self.test_data = ds['test'] + self.save_hyperparameters(args) + + def train_dataloader(self): + return DataLoader(self.train_data, shuffle=True, batch_size=self.train_batchsize, pin_memory=False, + collate_fn=self.collator) + + def val_dataloader(self): + return DataLoader(self.valid_data, shuffle=False, batch_size=self.valid_batchsize, pin_memory=False, + collate_fn=self.collator) + + def predict_dataloader(self): + return DataLoader(self.test_data, shuffle=False, batch_size=self.valid_batchsize, pin_memory=False, + collate_fn=self.collator) + + def load_schema(self, data_path, args): + with open(data_path, 'r', encoding='utf8') as f: + lines = f.readlines() + label_list = [] + for line in tqdm(lines): + data = json.loads(line) + labels = data[args.label_name] if args.label_name in data.keys( + ) else 0 + if labels not in label_list: + label_list.append(labels) + + label2id, id2label = {}, {} + for i, k in enumerate(label_list): + label2id[k] = i + id2label[i] = k + return label2id, id2label + + +class taskModel(torch.nn.Module): + def __init__(self, args): + super().__init__() + self.args = args + print('args mode type:', args.model_type) + self.bert_encoder = model_dict[args.model_type].from_pretrained( + args.pretrained_model_path) + self.config = self.bert_encoder.config + self.cls_layer = torch.nn.Linear( + in_features=self.config.hidden_size, out_features=self.args.num_labels) + self.loss_func = torch.nn.CrossEntropyLoss() + + def forward(self, input_ids, attention_mask, token_type_ids, labels=None): + if self.args.model_type == 'fengshen-megatron_t5': + bert_output = self.bert_encoder( + input_ids=input_ids, attention_mask=attention_mask) # (bsz, seq, dim) + encode = bert_output.last_hidden_state[:, 0, :] + else: + bert_output = self.bert_encoder( + input_ids=input_ids, 
attention_mask=attention_mask, token_type_ids=token_type_ids) # (bsz, seq, dim) + encode = bert_output[1] + logits = self.cls_layer(encode) + if labels is not None: + loss = self.loss_func(logits, labels.view(-1,)) + return loss, logits + else: + return 0, logits + + +class LitModel(pl.LightningModule): + + @staticmethod + def add_model_specific_args(parent_args): + parser = parent_args.add_argument_group('BaseModel') + parser.add_argument('--num_labels', default=2, type=int) + + return parent_args + + def __init__(self, args, num_data): + super().__init__() + self.args = args + self.num_data = num_data + self.model = model_dict[args.model_type].from_pretrained( + args.pretrained_model_path) + self.save_hyperparameters(args) + + def setup(self, stage) -> None: + train_loader = self.trainer._data_connector._train_dataloader_source.dataloader() + + # Calculate total steps + if self.trainer.max_epochs > 0: + world_size = self.trainer.world_size + tb_size = self.hparams.train_batchsize * max(1, world_size) + ab_size = self.trainer.accumulate_grad_batches + self.total_steps = (len(train_loader.dataset) * + self.trainer.max_epochs // tb_size) // ab_size + else: + self.total_steps = self.trainer.max_steps // self.trainer.accumulate_grad_batches + + print('Total steps: {}' .format(self.total_steps)) + + def training_step(self, batch, batch_idx): + del batch['id'] + output = self.model(**batch) + loss, logits = output[0], output[1] + acc = self.comput_metrix(logits, batch['labels']) + self.log('train_loss', loss) + self.log('train_acc', acc) + return loss + + def comput_metrix(self, logits, labels): + y_pred = torch.argmax(logits, dim=-1) + y_pred = y_pred.view(size=(-1,)) + y_true = labels.view(size=(-1,)).float() + corr = torch.eq(y_pred, y_true) + acc = torch.sum(corr.float())/labels.size()[0] + return acc + + def validation_step(self, batch, batch_idx): + del batch['id'] + output = self.model(**batch) + loss, logits = output[0], output[1] + acc = self.comput_metrix(logits, batch['labels']) + self.log('val_loss', loss) + self.log('val_acc', acc, sync_dist=True) + + def predict_step(self, batch, batch_idx): + ids = batch['id'] + del batch['id'] + output = self.model(**batch) + return {ids, output.logits} + + def configure_optimizers(self): + from fengshen.models.model_utils import configure_optimizers + return configure_optimizers(self) + + +class TaskModelCheckpoint: + @staticmethod + def add_argparse_args(parent_args): + parser = parent_args.add_argument_group('BaseModel') + + parser.add_argument('--monitor', default='train_loss', type=str) + parser.add_argument('--mode', default='min', type=str) + parser.add_argument('--dirpath', default='./log/', type=str) + parser.add_argument( + '--filename', default='model-{epoch:02d}-{train_loss:.4f}', type=str) + + parser.add_argument('--save_top_k', default=3, type=float) + parser.add_argument('--every_n_train_steps', default=100, type=float) + parser.add_argument('--save_weights_only', default=True, type=bool) + + return parent_args + + def __init__(self, args): + self.callbacks = ModelCheckpoint(monitor=args.monitor, + save_top_k=args.save_top_k, + mode=args.mode, + every_n_train_steps=args.every_n_train_steps, + save_weights_only=args.save_weights_only, + dirpath=args.dirpath, + every_n_epochs=1, + filename=args.filename) + + +def save_test(data, args, data_model, rank): + file_name = args.output_save_path + f'.{rank}' + with open(file_name, 'w', encoding='utf-8') as f: + idx = 0 + for i in range(len(data)): + ids, batch = data[i] + for id, sample in 
zip(ids, batch): + tmp_result = dict() + label_id = np.argmax(sample.cpu().numpy()) + tmp_result['id'] = id.item() + tmp_result['label'] = data_model.id2label[label_id] + json_data = json.dumps(tmp_result, ensure_ascii=False) + f.write(json_data+'\n') + idx += 1 + print('save the result to '+file_name) + + +def main(): + pl.seed_everything(42) + + total_parser = argparse.ArgumentParser("TASK NAME") + total_parser.add_argument('--pretrained_model_path', default='', type=str) + total_parser.add_argument('--output_save_path', + default='./predict.json', type=str) + total_parser.add_argument('--model_type', + default='huggingface-bert', type=str) + + # * Args for data preprocessing + total_parser = TaskDataModel.add_data_specific_args(total_parser) + # * Args for training + total_parser = pl.Trainer.add_argparse_args(total_parser) + total_parser = TaskModelCheckpoint.add_argparse_args(total_parser) + + # * Args for base model + from fengshen.models.model_utils import add_module_args + total_parser = add_module_args(total_parser) + total_parser = LitModel.add_model_specific_args(total_parser) + + args = total_parser.parse_args() + print(args.pretrained_model_path) + + checkpoint_callback = TaskModelCheckpoint(args).callbacks + early_stop_callback = EarlyStopping( + monitor="val_acc", min_delta=0.00, patience=5, verbose=False, mode="max") + lr_monitor = LearningRateMonitor(logging_interval='step') + trainer = pl.Trainer.from_argparse_args(args, + callbacks=[ + checkpoint_callback, + lr_monitor, + early_stop_callback] + ) + + data_model = TaskDataModel(args) + model = LitModel(args, len(data_model.train_dataloader())) + + trainer.fit(model, data_model) + result = trainer.predict( + model, data_model, ckpt_path=trainer.checkpoint_callback.best_model_path) + save_test(result, args, data_model, trainer.global_rank) + + +if __name__ == "__main__": + main() diff --git a/fengshen/examples/classification/finetune_classification.sh b/fengshen/examples/classification/finetune_classification.sh new file mode 100644 index 0000000000000000000000000000000000000000..993071ceb0ceeb44c0bf887abcdbc0c9f982c4d5 --- /dev/null +++ b/fengshen/examples/classification/finetune_classification.sh @@ -0,0 +1,75 @@ +#!/bin/bash +#SBATCH --job-name=slurm-test # create a short name for your job +#SBATCH --nodes=1 # node count +#SBATCH --ntasks=1 # total number of tasks across all nodes +#SBATCH --cpus-per-task=2 # cpu-cores per task (>1 if multi-threaded tasks) +#SBATCH --mem-per-cpu=16G # memory per cpu-core (4G is default) +#SBATCH --gres=gpu:1 # number of gpus per node +#SBATCH --mail-type=ALL # send email when job begins, ends or failed etc. 
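The save_test helper above writes one prediction shard per rank (args.output_save_path plus a ".{rank}" suffix), with one JSON object per line holding the sample id and the predicted label. A small sketch of merging those shards after a multi-GPU run, assuming the default predict.json prefix:

```python
import glob
import json

# Illustrative: gather the per-rank shards written by save_test (predict.json.0, predict.json.1, ...)
# and write a single file ordered by sample id.
predictions = []
for shard in sorted(glob.glob("predict.json.*")):
    with open(shard, encoding="utf-8") as f:
        predictions.extend(json.loads(line) for line in f)

with open("predict_merged.json", "w", encoding="utf-8") as f:
    for pred in sorted(predictions, key=lambda p: p["id"]):
        f.write(json.dumps(pred, ensure_ascii=False) + "\n")
```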
+ + + +MODEL_TYPE=fengshen-roformer +PRETRAINED_MODEL_PATH=IDEA-CCNL/Zhouwenwang-Unified-110M + +ROOT_PATH=cognitive_comp +TASK=tnews + +DATA_DIR=/$ROOT_PATH/yangping/data/ChineseCLUE_DATA/${TASK}_public/ +CHECKPOINT_PATH=/$ROOT_PATH/yangping/checkpoints/modelevaluation/tnews/ +OUTPUT_PATH=/$ROOT_PATH/yangping/nlp/modelevaluation/output/predict.json + +DATA_ARGS="\ + --data_dir $DATA_DIR \ + --train_data train.json \ + --valid_data dev.json \ + --test_data test1.1.json \ + --train_batchsize 32 \ + --valid_batchsize 128 \ + --max_length 128 \ + --texta_name sentence \ + --label_name label \ + --id_name id \ + " + +MODEL_ARGS="\ + --learning_rate 0.00002 \ + --weight_decay 0.1 \ + --num_labels 15 \ + " + +MODEL_CHECKPOINT_ARGS="\ + --monitor val_acc \ + --save_top_k 3 \ + --mode max \ + --every_n_train_steps 100 \ + --save_weights_only True \ + --dirpath $CHECKPOINT_PATH \ + --filename model-{epoch:02d}-{val_acc:.4f} \ + " + +TRAINER_ARGS="\ + --max_epochs 7 \ + --gpus 1 \ + --check_val_every_n_epoch 1 \ + --val_check_interval 100 \ + --default_root_dir ./log/ \ + " + + +options=" \ + --pretrained_model_path $PRETRAINED_MODEL_PATH \ + --output_save_path $OUTPUT_PATH \ + --model_type $MODEL_TYPE \ + $DATA_ARGS \ + $MODEL_ARGS \ + $MODEL_CHECKPOINT_ARGS \ + $TRAINER_ARGS \ + " + +DOCKER_PATH=/$ROOT_PATH/yangping/containers/pytorch21_06_py3_docker_image.sif +SCRIPT_PATH=/$ROOT_PATH/yangping/nlp/Fengshenbang-LM/fengshen/examples/classification/finetune_classification.py + +python3 $SCRIPT_PATH $options +# singularity exec --nv -B /cognitive_comp/:/cognitive_comp/ $DOCKER_PATH python3 $SCRIPT_PATH $options + diff --git a/fengshen/examples/classification/finetune_classification_bart-base_afqmc.sh b/fengshen/examples/classification/finetune_classification_bart-base_afqmc.sh new file mode 100644 index 0000000000000000000000000000000000000000..2700d2ad3d6fca47238db033781905ac372b183a --- /dev/null +++ b/fengshen/examples/classification/finetune_classification_bart-base_afqmc.sh @@ -0,0 +1,143 @@ +#!/bin/bash +#SBATCH --job-name=afqmc-bart-base # create a short name for your job +#SBATCH --nodes=1 # node count +#SBATCH --ntasks=2 # total number of tasks across all nodes +#SBATCH --cpus-per-task=30 # cpu-cores per task (>1 if multi-threaded tasks) +#SBATCH --gres=gpu:2 # number of gpus per node +#SBATCH --mail-type=ALL # send email when job begins, ends or failed etc. 
+#SBATCH -o %x-%j.log # output and error file name (%x=job name, %j=job id) + + +export TORCH_EXTENSIONS_DIR=/cognitive_comp/gaoxinyu/cache/torch_extendsions + +MODEL_NAME=bart-base + +TASK=afqmc +TEXTA_NAME=sentence1 +TEXTB_NAME=sentence2 +LABEL_NAME=label +ID_NAME=id + + +BATCH_SIZE=8 +VAL_BATCH_SIZE=32 +ZERO_STAGE=1 +STRATEGY=deepspeed_stage_${ZERO_STAGE} + +DATA_DIR=/cognitive_comp/yangping/data/ChineseCLUE_DATA/${TASK}_public/ +PRETRAINED_MODEL_PATH=/cognitive_comp/gaoxinyu/pretrained_model/$MODEL_NAME/ + + +CHECKPOINT_PATH=/cognitive_comp/gaoxinyu/ln_model/finetune/ckpt/$TASK/ +DEFAULT_ROOT_DIR=/cognitive_comp/gaoxinyu/ln_model/finetune/${MODEL_NAME}-${TASK} +OUTPUT_PATH=/cognitive_comp/gaoxinyu/ln_model/finetune/${MODEL_NAME}-${TASK}/predict.json + + +config_json="./ds_config.${MODEL_NAME}.json" +# Deepspeed figures out GAS dynamically from dynamic GBS via set_train_batch_size() +# reduce_bucket_size: hidden_size*hidden_size +# stage3_prefetch_bucket_size: 0.9 * hidden_size * hidden_size +# stage3_param_persistence_threshold: 10 * hidden_size + +cat < $config_json +{ + "train_micro_batch_size_per_gpu": $BATCH_SIZE, + "steps_per_print": 100, + "gradient_clipping": 0.1, + "zero_optimization": { + "stage": ${ZERO_STAGE} + }, + "optimizer": { + "type": "Adam", + "params": { + "lr": 1e-7, + "eps": 1e-12, + "weight_decay": 1e-2 + } + }, + "scheduler": { + "type": "WarmupLR", + "params":{ + "warmup_min_lr": 1e-5, + "warmup_max_lr": 1e-4, + "warmup_num_steps": 400, + "warmup_type": "linear" + } + }, + "zero_allow_untested_optimizer": false, + "fp16": { + "enabled": false, + "loss_scale": 0, + "loss_scale_window": 1000, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "activation_checkpointing": { + "partition_activations": false, + "contiguous_memory_optimization": false + }, + "wall_clock_breakdown": false +} +EOT + +export PL_DEEPSPEED_CONFIG_PATH=$config_json + + +DATA_ARGS="\ + --data_dir $DATA_DIR \ + --train_data train.json \ + --valid_data dev.json \ + --test_data test.json \ + --train_batchsize $BATCH_SIZE \ + --valid_batchsize $VAL_BATCH_SIZE \ + --max_length 64 \ + --texta_name $TEXTA_NAME \ + --textb_name $TEXTB_NAME \ + --label_name $LABEL_NAME \ + --id_name $ID_NAME \ + " + +MODEL_ARGS="\ + --learning_rate 1e-6 \ + --weight_decay 1e-2 \ + --warmup 0.01 \ + --num_labels 2 \ + " + +MODEL_CHECKPOINT_ARGS="\ + --monitor val_acc \ + --save_top_k 3 \ + --mode max \ + --every_n_train_steps 200 \ + --save_weights_only True \ + --dirpath $CHECKPOINT_PATH \ + --filename model-{epoch:02d}-{val_acc:.4f} \ + " + + +TRAINER_ARGS="\ + --max_epochs 67 \ + --gpus 2 \ + --num_nodes 1 \ + --strategy $STRATEGY \ + --gradient_clip_val 1.0 \ + --check_val_every_n_epoch 1 \ + --val_check_interval 1.0 \ + --default_root_dir $DEFAULT_ROOT_DIR \ + " + +options=" \ + --pretrained_model_path $PRETRAINED_MODEL_PATH \ + --output_save_path $OUTPUT_PATH \ + $DATA_ARGS \ + $MODEL_ARGS \ + $MODEL_CHECKPOINT_ARGS \ + $TRAINER_ARGS \ + " + +DOCKER_PATH=/cognitive_comp/gaoxinyu/docker/pytorch21_06_py3_docker_image_v2.sif +SCRIPT_PATH=/cognitive_comp/gaoxinyu/github/Fengshenbang-LM/fengshen/examples/classification/finetune_classification.py + +# python3 $SCRIPT_PATH $options +srun singularity exec --nv -B /cognitive_comp/:/cognitive_comp/ $DOCKER_PATH python3 $SCRIPT_PATH $options + diff --git a/fengshen/examples/classification/finetune_classification_bart-base_ocnli.sh b/fengshen/examples/classification/finetune_classification_bart-base_ocnli.sh new file mode 100644 index 
0000000000000000000000000000000000000000..6ef4886993eb2c1c8938180c940ece9bb156b73f --- /dev/null +++ b/fengshen/examples/classification/finetune_classification_bart-base_ocnli.sh @@ -0,0 +1,143 @@ +#!/bin/bash +#SBATCH --job-name=ocnli-bart-base # create a short name for your job +#SBATCH --nodes=1 # node count +#SBATCH --ntasks=2 # total number of tasks across all nodes +#SBATCH --cpus-per-task=30 # cpu-cores per task (>1 if multi-threaded tasks) +#SBATCH --gres=gpu:2 # number of gpus per node +#SBATCH --mail-type=ALL # send email when job begins, ends or failed etc. +#SBATCH -o %x-%j.log # output and error file name (%x=job name, %j=job id) + + +export TORCH_EXTENSIONS_DIR=/cognitive_comp/gaoxinyu/cache/torch_extendsions + +MODEL_NAME=bart-base + +TASK=ocnli +TEXTA_NAME=sentence1 +TEXTB_NAME=sentence2 +LABEL_NAME=label +ID_NAME=id + + +BATCH_SIZE=8 +VAL_BATCH_SIZE=32 +ZERO_STAGE=1 +STRATEGY=deepspeed_stage_${ZERO_STAGE} + +DATA_DIR=/cognitive_comp/yangping/data/ChineseCLUE_DATA/${TASK}_public/ +PRETRAINED_MODEL_PATH=/cognitive_comp/gaoxinyu/pretrained_model/$MODEL_NAME/ + + +CHECKPOINT_PATH=/cognitive_comp/gaoxinyu/ln_model/finetune/ckpt/$TASK/ +DEFAULT_ROOT_DIR=/cognitive_comp/gaoxinyu/ln_model/finetune/${MODEL_NAME}-${TASK} +OUTPUT_PATH=/cognitive_comp/gaoxinyu/ln_model/finetune/${MODEL_NAME}-${TASK}/predict.json + + +config_json="./ds_config.${MODEL_NAME}.json" +# Deepspeed figures out GAS dynamically from dynamic GBS via set_train_batch_size() +# reduce_bucket_size: hidden_size*hidden_size +# stage3_prefetch_bucket_size: 0.9 * hidden_size * hidden_size +# stage3_param_persistence_threshold: 10 * hidden_size + +cat < $config_json +{ + "train_micro_batch_size_per_gpu": $BATCH_SIZE, + "steps_per_print": 100, + "gradient_clipping": 0.1, + "zero_optimization": { + "stage": ${ZERO_STAGE} + }, + "optimizer": { + "type": "Adam", + "params": { + "lr": 1e-7, + "eps": 1e-12, + "weight_decay": 1e-2 + } + }, + "scheduler": { + "type": "WarmupLR", + "params":{ + "warmup_min_lr": 1e-8, + "warmup_max_lr": 1e-6, + "warmup_num_steps": 400, + "warmup_type": "linear" + } + }, + "zero_allow_untested_optimizer": false, + "fp16": { + "enabled": false, + "loss_scale": 0, + "loss_scale_window": 1000, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "activation_checkpointing": { + "partition_activations": false, + "contiguous_memory_optimization": false + }, + "wall_clock_breakdown": false +} +EOT + +export PL_DEEPSPEED_CONFIG_PATH=$config_json + + +DATA_ARGS="\ + --data_dir $DATA_DIR \ + --train_data train.json \ + --valid_data dev.json \ + --test_data test.json \ + --train_batchsize $BATCH_SIZE \ + --valid_batchsize $VAL_BATCH_SIZE \ + --max_length 128 \ + --texta_name $TEXTA_NAME \ + --textb_name $TEXTB_NAME \ + --label_name $LABEL_NAME \ + --id_name $ID_NAME \ + " + +MODEL_ARGS="\ + --learning_rate 1e-6 \ + --weight_decay 1e-2 \ + --warmup 0.01 \ + --num_labels 3 \ + " + +MODEL_CHECKPOINT_ARGS="\ + --monitor val_acc \ + --save_top_k 3 \ + --mode max \ + --every_n_train_steps 200 \ + --save_weights_only True \ + --dirpath $CHECKPOINT_PATH \ + --filename model-{epoch:02d}-{val_acc:.4f} \ + " + + +TRAINER_ARGS="\ + --max_epochs 67 \ + --gpus 2 \ + --num_nodes 1 \ + --strategy $STRATEGY \ + --gradient_clip_val 1.0 \ + --check_val_every_n_epoch 1 \ + --val_check_interval 1.0 \ + --default_root_dir $DEFAULT_ROOT_DIR \ + " + +options=" \ + --pretrained_model_path $PRETRAINED_MODEL_PATH \ + --output_save_path $OUTPUT_PATH \ + $DATA_ARGS \ + $MODEL_ARGS \ + $MODEL_CHECKPOINT_ARGS \ + $TRAINER_ARGS \ + " + 
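These scripts build their DeepSpeed config with a shell heredoc and hand it to PyTorch Lightning through the PL_DEEPSPEED_CONFIG_PATH environment variable, so a malformed heredoc only surfaces at launch time. A quick pre-flight check, assuming the variable is exported as in the script above:

```python
import json
import os

# Illustrative sanity check: confirm the heredoc-generated DeepSpeed config parses as JSON.
config_path = os.environ.get("PL_DEEPSPEED_CONFIG_PATH", "./ds_config.json")
with open(config_path) as f:
    ds_config = json.load(f)  # raises json.JSONDecodeError if the heredoc output is malformed
print("ZeRO stage:", ds_config["zero_optimization"]["stage"], "| fp16:", ds_config["fp16"]["enabled"])
```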
+DOCKER_PATH=/cognitive_comp/gaoxinyu/docker/pytorch21_06_py3_docker_image_v2.sif +SCRIPT_PATH=/cognitive_comp/gaoxinyu/github/Fengshenbang-LM/fengshen/examples/classification/finetune_classification.py + +# python3 $SCRIPT_PATH $options +srun singularity exec --nv -B /cognitive_comp/:/cognitive_comp/ $DOCKER_PATH python3 $SCRIPT_PATH $options + diff --git a/fengshen/examples/classification/finetune_classification_bert-3.9B_afqmc.sh b/fengshen/examples/classification/finetune_classification_bert-3.9B_afqmc.sh new file mode 100644 index 0000000000000000000000000000000000000000..9d36b627d6cc1b0a8de575138eec6a7529b31137 --- /dev/null +++ b/fengshen/examples/classification/finetune_classification_bert-3.9B_afqmc.sh @@ -0,0 +1,146 @@ +#!/bin/bash +#SBATCH --job-name=afqmc # create a short name for your job +#SBATCH --nodes=1 # node count +#SBATCH --ntasks=4 # total number of tasks across all nodes +#SBATCH --cpus-per-task=20 # cpu-cores per task (>1 if multi-threaded tasks) +#SBATCH --gres=gpu:4 # number of gpus per node +#SBATCH --mail-type=ALL # send email when job begins, ends or failed etc. +#SBATCH -o %x-%j.log # output and error file name (%x=job name, %j=job id) + +set -x -e +echo "START TIME: $(date)" + +export TORCH_EXTENSIONS_DIR=/cognitive_comp/gaoxinyu/cache/torch_extendsions + +BERT_NAME=bert-3.9B + +TASK=afqmc +TEXTA_NAME=sentence1 +TEXTB_NAME=sentence2 +LABEL_NAME=label +ID_NAME=id + + +BATCH_SIZE=8 +VAL_BATCH_SIZE=32 +ZERO_STAGE=2 +STRATEGY=deepspeed_stage_${ZERO_STAGE} + +DATA_DIR=/cognitive_comp/yangping/data/ChineseCLUE_DATA/${TASK}_public/ +PRETRAINED_MODEL_PATH=/cognitive_comp/gaoxinyu/pretrained_model/$BERT_NAME/ + + +CHECKPOINT_PATH=/cognitive_comp/gaoxinyu/ln_model/fintune/ckpt/fengshen-finetune/$TASK/ +DEFAULT_ROOT_DIR=/cognitive_comp/gaoxinyu/ln_model/finetune/${BERT_NAME}-${TASK} +OUTPUT_PATH=/cognitive_comp/gaoxinyu/ln_model/finetune/${BERT_NAME}-${TASK}/predict.json + + +config_json="./ds_config.json" +# Deepspeed figures out GAS dynamically from dynamic GBS via set_train_batch_size() +# reduce_bucket_size: hidden_size*hidden_size +# stage3_prefetch_bucket_size: 0.9 * hidden_size * hidden_size +# stage3_param_persistence_threshold: 10 * hidden_size + +cat < $config_json +{ + "train_micro_batch_size_per_gpu": $BATCH_SIZE, + "steps_per_print": 1000, + "gradient_clipping": 0.1, + "zero_optimization": { + "stage": 2 + }, + "optimizer": { + "type": "Adam", + "params": { + "lr": 1e-7, + "eps": 1e-12, + "weight_decay": 1e-1 + } + }, + "scheduler": { + "type": "WarmupLR", + "params":{ + "warmup_min_lr": 1e-8, + "warmup_max_lr": 1e-6, + "warmup_num_steps": 400, + "warmup_type": "linear" + } + }, + "zero_allow_untested_optimizer": false, + "fp16": { + "enabled": true, + "loss_scale": 0, + "loss_scale_window": 1000, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "activation_checkpointing": { + "partition_activations": false, + "contiguous_memory_optimization": false + }, + "wall_clock_breakdown": false +} +EOT + +export PL_DEEPSPEED_CONFIG_PATH=$config_json + + +DATA_ARGS="\ + --data_dir $DATA_DIR \ + --train_data train.json \ + --valid_data dev.json \ + --test_data test.json \ + --train_batchsize $BATCH_SIZE \ + --valid_batchsize $VAL_BATCH_SIZE \ + --max_length 128 \ + --texta_name $TEXTA_NAME \ + --textb_name $TEXTB_NAME \ + --label_name $LABEL_NAME \ + --id_name $ID_NAME \ + " + +MODEL_ARGS="\ + --learning_rate 1e-5 \ + --weight_decay 1e-2 \ + --warmup 0.01 \ + --num_labels 2 \ + " + +MODEL_CHECKPOINT_ARGS="\ + --monitor val_acc \ + --save_top_k 3 \ + --mode max \ + 
--every_n_train_steps 0 \ + --save_weights_only True \ + --dirpath $CHECKPOINT_PATH \ + --filename model-{epoch:02d}-{val_acc:.4f} \ + " + + +TRAINER_ARGS="\ + --max_epochs 67 \ + --gpus 4 \ + --num_nodes 1 \ + --strategy $STRATEGY \ + --gradient_clip_val 1.0 \ + --check_val_every_n_epoch 1 \ + --val_check_interval 100 \ + --precision 16 \ + --default_root_dir $DEFAULT_ROOT_DIR \ + " + +options=" \ + --pretrained_model_path $PRETRAINED_MODEL_PATH \ + --output_save_path $OUTPUT_PATH \ + $DATA_ARGS \ + $MODEL_ARGS \ + $MODEL_CHECKPOINT_ARGS \ + $TRAINER_ARGS \ + " + +DOCKER_PATH=/cognitive_comp/gaoxinyu/docker/pytorch21_06_py3_docker_image_v2.sif +SCRIPT_PATH=/cognitive_comp/gaoxinyu/github/Fengshenbang-LM/fengshen/examples/classification/finetune_classification.py + +# python3 $SCRIPT_PATH $options +srun -N 1 --job-name=afqmc --jobid=151522 --ntasks=4 --cpus-per-task=15 --gres=gpu:4 -o %x-%j.log singularity exec --nv -B /cognitive_comp/:/cognitive_comp/ $DOCKER_PATH python3 $SCRIPT_PATH $options + diff --git a/fengshen/examples/classification/finetune_classification_bert-3.9B_cmnli.sh b/fengshen/examples/classification/finetune_classification_bert-3.9B_cmnli.sh new file mode 100644 index 0000000000000000000000000000000000000000..da10752cff77be9462d17cbb45882543a5e0ed48 --- /dev/null +++ b/fengshen/examples/classification/finetune_classification_bert-3.9B_cmnli.sh @@ -0,0 +1,161 @@ +#!/bin/bash +#SBATCH --job-name=slurm-test # create a short name for your job +#SBATCH --nodes=1 # node count +#SBATCH --ntasks=2 # total number of tasks across all nodes +#SBATCH --cpus-per-task=16 # cpu-cores per task (>1 if multi-threaded tasks) +#SBATCH --mem-per-cpu=8G # memory per cpu-core (4G is default) +#SBATCH --gres=gpu:2 # number of gpus per node +#SBATCH --mail-type=ALL # send email when job begins, ends or failed etc. 
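As a worked example of the total-steps arithmetic in LitModel.setup() earlier in this diff: the preceding afqmc bert-3.9B script trains with a per-GPU batch size of 8 on 4 GPUs and no gradient accumulation, so one optimizer step consumes 32 samples. Using an illustrative training-set size (the real AFQMC count may differ):

```python
# Worked example of total_steps = (len(train) * max_epochs // (batch * world_size)) // accumulation.
# train_size is a stand-in value; batch size, GPU count, and max_epochs follow the script above.
train_size = 34_000              # illustrative, not the verified AFQMC size
max_epochs = 67
train_batchsize = 8
world_size = 4                   # --gpus 4
accumulate_grad_batches = 1

tb_size = train_batchsize * world_size                        # 32 samples per optimizer step
total_steps = (train_size * max_epochs // tb_size) // accumulate_grad_batches
print(total_steps)               # 71187
```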
+ + +export TORCH_EXTENSIONS_DIR=/cognitive_comp/yangping/cache/torch_extendsions + +BERT_NAME=bert-3.9B + +TASK=cmnli +TEXTA_NAME=sentence1 +TEXTB_NAME=sentence2 +LABEL_NAME=label +ID_NAME=id + + +BATCH_SIZE=16 +VAL_BATCH_SIZE=56 +ZERO_STAGE=2 + + +ROOT_PATH=cognitive_comp +DATA_DIR=/$ROOT_PATH/yangping/data/ChineseCLUE_DATA/${TASK}_public/ +PRETRAINED_MODEL_PATH=/$ROOT_PATH/yangping/pretrained_model/$BERT_NAME/ + + +CHECKPOINT_PATH=/$ROOT_PATH/yangping/checkpoints/fengshen-finetune/$TASK/ +DEFAULT_ROOT_DIR=/cognitive_comp/yangping/nlp/fengshen/fengshen/scripts/log/$TASK/$BERT_NAME/ +OUTPUT_PATH=/$ROOT_PATH/yangping/nlp/modelevaluation/output/${TASK}_predict.json + + +config_json="./ds_config.json" +# Deepspeed figures out GAS dynamically from dynamic GBS via set_train_batch_size() +# reduce_bucket_size: hidden_size*hidden_size +# stage3_prefetch_bucket_size: 0.9 * hidden_size * hidden_size +# stage3_param_persistence_threshold: 10 * hidden_size + +cat < $config_json +{ + "train_micro_batch_size_per_gpu": $BATCH_SIZE, + "steps_per_print": 100, + "gradient_clipping": 1.0, + "zero_optimization": { + "stage": 3, + "offload_optimizer": { + "device": "cpu", + "pin_memory": true + }, + "offload_param": { + "device": "cpu", + "pin_memory": true + }, + "overlap_comm": true, + "contiguous_gradients": true, + "sub_group_size": 1e9, + "reduce_bucket_size": 6553600, + "stage3_prefetch_bucket_size": 5898240, + "stage3_param_persistence_threshold": 25600, + "stage3_max_live_parameters": 1e9, + "stage3_max_reuse_distance": 1e9, + "stage3_gather_fp16_weights_on_model_save": true + }, + "optimizer": { + "type": "Adam", + "params": { + "lr": 1e-6, + "betas": [ + 0.9, + 0.95 + ], + "eps": 1e-8, + "weight_decay": 1e-3 + } + }, + "scheduler": { + "type": "WarmupLR", + "params":{ + "warmup_min_lr": 5e-8, + "warmup_max_lr": 1e-6 + } + }, + "zero_allow_untested_optimizer": false, + "fp16": { + "enabled": true, + "loss_scale": 0, + "loss_scale_window": 1000, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "activation_checkpointing": { + "partition_activations": false, + "contiguous_memory_optimization": false + }, + "wall_clock_breakdown": false +} +EOT + +export PL_DEEPSPEED_CONFIG_PATH=$config_json + + +DATA_ARGS="\ + --data_dir $DATA_DIR \ + --train_data train.json \ + --valid_data dev.json \ + --test_data test.json \ + --train_batchsize $BATCH_SIZE \ + --valid_batchsize $VAL_BATCH_SIZE \ + --max_length 128 \ + --texta_name $TEXTA_NAME \ + --textb_name $TEXTB_NAME \ + --label_name $LABEL_NAME \ + --id_name $ID_NAME \ + " + +MODEL_ARGS="\ + --learning_rate 0.000001 \ + --weight_decay 0.001 \ + --warmup 0.001 \ + --num_labels 3 \ + " + +MODEL_CHECKPOINT_ARGS="\ + --monitor val_acc \ + --save_top_k 3 \ + --mode max \ + --every_n_train_steps 100 \ + --save_weights_only True \ + --dirpath $CHECKPOINT_PATH \ + --filename model-{epoch:02d}-{val_acc:.4f} \ + " +TRAINER_ARGS="\ + --max_epochs 7 \ + --gpus 2 \ + --strategy deepspeed_stage_3 \ + --precision 16 \ + --gradient_clip_val 0.1 \ + --check_val_every_n_epoch 1 \ + --val_check_interval 100 \ + --default_root_dir $DEFAULT_ROOT_DIR \ + " + +options=" \ + --pretrained_model_path $PRETRAINED_MODEL_PATH \ + --output_save_path $OUTPUT_PATH \ + $DATA_ARGS \ + $MODEL_ARGS \ + $MODEL_CHECKPOINT_ARGS \ + $TRAINER_ARGS \ + " + +DOCKER_PATH=/$ROOT_PATH/yangping/containers/pytorch21_06_py3_docker_image.sif +SCRIPT_PATH=/$ROOT_PATH/yangping/nlp/fengshen/fengshen/examples/finetune_classification.py + +# python3 $SCRIPT_PATH $options +srun singularity exec --nv -B 
/cognitive_comp/:/cognitive_comp/ $DOCKER_PATH python3 $SCRIPT_PATH $options + diff --git a/fengshen/examples/classification/finetune_classification_bert-3.9B_iflytek.sh b/fengshen/examples/classification/finetune_classification_bert-3.9B_iflytek.sh new file mode 100644 index 0000000000000000000000000000000000000000..13e08efc318a60eabec72cd4357f8aa9dd558f44 --- /dev/null +++ b/fengshen/examples/classification/finetune_classification_bert-3.9B_iflytek.sh @@ -0,0 +1,158 @@ +#!/bin/bash +#SBATCH --job-name=slurm-test # create a short name for your job +#SBATCH --nodes=1 # node count +#SBATCH --ntasks=2 # total number of tasks across all nodes +#SBATCH --cpus-per-task=16 # cpu-cores per task (>1 if multi-threaded tasks) +#SBATCH --mem-per-cpu=8G # memory per cpu-core (4G is default) +#SBATCH --gres=gpu:2 # number of gpus per node +#SBATCH --mail-type=ALL # send email when job begins, ends or failed etc. + + +export TORCH_EXTENSIONS_DIR=/cognitive_comp/yangping/cache/torch_extendsions + +BERT_NAME=bert-3.9B + +TASK=iflytek +TEXTA_NAME=sentence +LABEL_NAME=label +ID_NAME=id + + +BATCH_SIZE=16 +VAL_BATCH_SIZE=56 +ZERO_STAGE=2 + + +ROOT_PATH=cognitive_comp +DATA_DIR=/$ROOT_PATH/yangping/data/ChineseCLUE_DATA/${TASK}_public/ +PRETRAINED_MODEL_PATH=/$ROOT_PATH/yangping/pretrained_model/$BERT_NAME/ + + +CHECKPOINT_PATH=/$ROOT_PATH/yangping/checkpoints/fengshen-finetune/$TASK/ +DEFAULT_ROOT_DIR=/cognitive_comp/yangping/nlp/Fengshenbang-LM/fengshen/scripts/log/$TASK +OUTPUT_PATH=/$ROOT_PATH/yangping/nlp/modelevaluation/output/${TASK}_predict.json + + +config_json="./ds_config.$SLURM_JOBID.json" +# Deepspeed figures out GAS dynamically from dynamic GBS via set_train_batch_size() +# reduce_bucket_size: hidden_size*hidden_size +# stage3_prefetch_bucket_size: 0.9 * hidden_size * hidden_size +# stage3_param_persistence_threshold: 10 * hidden_size + +cat < $config_json +{ + "train_micro_batch_size_per_gpu": $BATCH_SIZE, + "steps_per_print": 100, + "gradient_clipping": 1.0, + "zero_optimization": { + "stage": 3, + "offload_optimizer": { + "device": "cpu", + "pin_memory": true + }, + "offload_param": { + "device": "cpu", + "pin_memory": true + }, + "overlap_comm": true, + "contiguous_gradients": true, + "sub_group_size": 1e9, + "reduce_bucket_size": 6553600, + "stage3_prefetch_bucket_size": 5898240, + "stage3_param_persistence_threshold": 25600, + "stage3_max_live_parameters": 1e9, + "stage3_max_reuse_distance": 1e9, + "stage3_gather_fp16_weights_on_model_save": true + }, + "optimizer": { + "type": "Adam", + "params": { + "lr": 1e-5, + "betas": [ + 0.9, + 0.95 + ], + "eps": 1e-8, + "weight_decay": 1e-2 + } + }, + "scheduler": { + "type": "WarmupLR", + "params":{ + "warmup_min_lr": 5e-6, + "warmup_max_lr": 1e-5 + } + }, + "zero_allow_untested_optimizer": false, + "fp16": { + "enabled": true, + "loss_scale": 0, + "loss_scale_window": 1000, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "activation_checkpointing": { + "partition_activations": false, + "contiguous_memory_optimization": false + }, + "wall_clock_breakdown": false +} +EOT + +export PL_DEEPSPEED_CONFIG_PATH=$config_json + + +DATA_ARGS="\ + --data_dir $DATA_DIR \ + --train_data train.json \ + --valid_data dev.json \ + --test_data test.json \ + --train_batchsize $BATCH_SIZE \ + --valid_batchsize $VAL_BATCH_SIZE \ + --max_length 128 \ + --texta_name $TEXTA_NAME \ + --label_name $LABEL_NAME \ + --id_name $ID_NAME \ + " + +MODEL_ARGS="\ + --learning_rate 0.00001 \ + --weight_decay 0.01 \ + --warmup 0.001 \ + --num_labels 119 \ + " + 
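Here --num_labels is 119 because iflytek is a 119-class classification task; in the taskModel class defined in finetune_classification.py it simply sets the output width of the linear head stacked on the encoder. A stripped-down sketch of that head (the hidden size is a placeholder and depends on the backbone):

```python
import torch

# Illustrative classification head in the style of taskModel's cls_layer + CrossEntropyLoss.
hidden_size, num_labels = 768, 119          # 768 is a placeholder; 119 matches --num_labels above
cls_layer = torch.nn.Linear(in_features=hidden_size, out_features=num_labels)
loss_func = torch.nn.CrossEntropyLoss()

pooled = torch.randn(4, hidden_size)        # stand-in for the encoder's pooled output
labels = torch.randint(0, num_labels, (4,))
logits = cls_layer(pooled)
loss = loss_func(logits, labels.view(-1))
print(logits.shape, loss.item())
```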
+MODEL_CHECKPOINT_ARGS="\ + --monitor val_acc \ + --save_top_k 3 \ + --mode max \ + --every_n_train_steps 100 \ + --save_weights_only True \ + --dirpath $CHECKPOINT_PATH \ + --filename model-{epoch:02d}-{val_acc:.4f} \ + " +TRAINER_ARGS="\ + --max_epochs 7 \ + --gpus 2 \ + --strategy deepspeed_stage_3 \ + --precision 16 \ + --check_val_every_n_epoch 1 \ + --val_check_interval 100 \ + --default_root_dir $DEFAULT_ROOT_DIR \ + " + +options=" \ + --pretrained_model_path $PRETRAINED_MODEL_PATH \ + --output_save_path $OUTPUT_PATH \ + $DATA_ARGS \ + $MODEL_ARGS \ + $MODEL_CHECKPOINT_ARGS \ + $TRAINER_ARGS \ + " + +DOCKER_PATH=/$ROOT_PATH/yangping/containers/pytorch21_06_py3_docker_image.sif +SCRIPT_PATH=/$ROOT_PATH/yangping/nlp/fengshen/fengshen/examples/finetune_classification.py + +# python3 $SCRIPT_PATH $options +srun singularity exec --nv -B /cognitive_comp/:/cognitive_comp/ $DOCKER_PATH python3 $SCRIPT_PATH $options + diff --git a/fengshen/examples/classification/finetune_classification_bert-3.9B_ocnli.sh b/fengshen/examples/classification/finetune_classification_bert-3.9B_ocnli.sh new file mode 100644 index 0000000000000000000000000000000000000000..8d3107931f88671d54d50325b8d469a12ee4e224 --- /dev/null +++ b/fengshen/examples/classification/finetune_classification_bert-3.9B_ocnli.sh @@ -0,0 +1,163 @@ +#!/bin/bash +#SBATCH --job-name=slurm-test # create a short name for your job +#SBATCH --nodes=1 # node count +#SBATCH --ntasks=2 # total number of tasks across all nodes +#SBATCH --cpus-per-task=16 # cpu-cores per task (>1 if multi-threaded tasks) +#SBATCH --mem-per-cpu=8G # memory per cpu-core (4G is default) +#SBATCH --gres=gpu:2 # number of gpus per node +#SBATCH --mail-type=ALL # send email when job begins, ends or failed etc. + + +export TORCH_EXTENSIONS_DIR=/cognitive_comp/yangping/cache/torch_extendsions + +BERT_NAME=bert-1.3B + +TASK=ocnli +TEXTA_NAME=sentence1 +TEXTB_NAME=sentence2 +LABEL_NAME=label +ID_NAME=id + + +BATCH_SIZE=16 +VAL_BATCH_SIZE=56 +ZERO_STAGE=2 + + +ROOT_PATH=cognitive_comp +DATA_DIR=/$ROOT_PATH/yangping/data/ChineseCLUE_DATA/${TASK}_public/ +PRETRAINED_MODEL_PATH=/$ROOT_PATH/yangping/pretrained_model/$BERT_NAME/ + + +CHECKPOINT_PATH=/$ROOT_PATH/yangping/checkpoints/fengshen-finetune/$TASK/ +DEFAULT_ROOT_DIR=/cognitive_comp/yangping/nlp/fengshen/fengshen/scripts/log/$TASK/$BERT_NAME +OUTPUT_PATH=/$ROOT_PATH/yangping/nlp/modelevaluation/output/${TASK}_predict.json + + +config_json="./ds_config.$SLURM_JOBID.json" +# Deepspeed figures out GAS dynamically from dynamic GBS via set_train_batch_size() +# reduce_bucket_size: hidden_size*hidden_size +# stage3_prefetch_bucket_size: 0.9 * hidden_size * hidden_size +# stage3_param_persistence_threshold: 10 * hidden_size + +cat < $config_json +{ + "train_micro_batch_size_per_gpu": $BATCH_SIZE, + "steps_per_print": 100, + "gradient_clipping": 0.1, + "zero_optimization": { + "stage": 3, + "offload_optimizer": { + "device": "cpu", + "pin_memory": true + }, + "offload_param": { + "device": "cpu", + "pin_memory": true + }, + "overlap_comm": true, + "contiguous_gradients": true, + "sub_group_size": 1e9, + "reduce_bucket_size": 6553600, + "stage3_prefetch_bucket_size": 5898240, + "stage3_param_persistence_threshold": 25600, + "stage3_max_live_parameters": 1e9, + "stage3_max_reuse_distance": 1e9, + "stage3_gather_fp16_weights_on_model_save": true + }, + "optimizer": { + "type": "Adam", + "params": { + "lr": 1e-6, + "betas": [ + 0.9, + 0.95 + ], + "eps": 1e-8, + "weight_decay": 1e-6 + } + }, + "scheduler": { + "type": "WarmupLR", + 
"params":{ + "warmup_min_lr": 5e-8, + "warmup_max_lr": 1e-6, + "warmup_num_steps": 400, + "warmup_type": "linear" + } + }, + "zero_allow_untested_optimizer": false, + "fp16": { + "enabled": true, + "loss_scale": 0, + "loss_scale_window": 1000, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "activation_checkpointing": { + "partition_activations": false, + "contiguous_memory_optimization": false + }, + "wall_clock_breakdown": false +} +EOT + +export PL_DEEPSPEED_CONFIG_PATH=$config_json + + +DATA_ARGS="\ + --data_dir $DATA_DIR \ + --train_data train.json \ + --valid_data dev.json \ + --test_data test.json \ + --train_batchsize $BATCH_SIZE \ + --valid_batchsize $VAL_BATCH_SIZE \ + --max_length 128 \ + --texta_name $TEXTA_NAME \ + --textb_name $TEXTB_NAME \ + --label_name $LABEL_NAME \ + --id_name $ID_NAME \ + " + +MODEL_ARGS="\ + --learning_rate 0.000001 \ + --weight_decay 0.001 \ + --warmup 0.001 \ + --num_labels 3 \ + " + +MODEL_CHECKPOINT_ARGS="\ + --monitor val_acc \ + --save_top_k 3 \ + --mode max \ + --every_n_train_steps 100 \ + --save_weights_only True \ + --dirpath $CHECKPOINT_PATH \ + --filename model-{epoch:02d}-{val_acc:.4f} \ + " +TRAINER_ARGS="\ + --max_epochs 7 \ + --gpus 2 \ + --strategy deepspeed_stage_3 \ + --precision 16 \ + --gradient_clip_val 0.1 \ + --check_val_every_n_epoch 1 \ + --val_check_interval 100 \ + --default_root_dir $DEFAULT_ROOT_DIR \ + " + +options=" \ + --pretrained_model_path $PRETRAINED_MODEL_PATH \ + --output_save_path $OUTPUT_PATH \ + $DATA_ARGS \ + $MODEL_ARGS \ + $MODEL_CHECKPOINT_ARGS \ + $TRAINER_ARGS \ + " + +DOCKER_PATH=/$ROOT_PATH/yangping/containers/pytorch21_06_py3_docker_image.sif +SCRIPT_PATH=/$ROOT_PATH/yangping/nlp/fengshen/fengshen/examples/finetune_classification.py + +# python3 $SCRIPT_PATH $options +srun singularity exec --nv -B /cognitive_comp/:/cognitive_comp/ $DOCKER_PATH python3 $SCRIPT_PATH $options + diff --git a/fengshen/examples/classification/finetune_classification_bert-3.9B_tnews.sh b/fengshen/examples/classification/finetune_classification_bert-3.9B_tnews.sh new file mode 100644 index 0000000000000000000000000000000000000000..62a2349bd4ce90d20f9747fd570cb070ea60be2f --- /dev/null +++ b/fengshen/examples/classification/finetune_classification_bert-3.9B_tnews.sh @@ -0,0 +1,161 @@ +#!/bin/bash +#SBATCH --job-name=slurm-test # create a short name for your job +#SBATCH --nodes=1 # node count +#SBATCH --ntasks=4 # total number of tasks across all nodes +#SBATCH --cpus-per-task=16 # cpu-cores per task (>1 if multi-threaded tasks) +#SBATCH --mem-per-cpu=8G # memory per cpu-core (4G is default) +#SBATCH --gres=gpu:4 # number of gpus per node +#SBATCH --mail-type=ALL # send email when job begins, ends or failed etc. 
+ + +export TORCH_EXTENSIONS_DIR=/cognitive_comp/yangping/cache/torch_extendsions + +BERT_NAME=bert-3.9B + +TASK=tnews +TEXTA_NAME=sentence +LABEL_NAME=label +ID_NAME=id + + +BATCH_SIZE=16 +VAL_BATCH_SIZE=56 +ZERO_STAGE=2 + + +ROOT_PATH=cognitive_comp +DATA_DIR=/$ROOT_PATH/yangping/data/ChineseCLUE_DATA/${TASK}_public/ +PRETRAINED_MODEL_PATH=/$ROOT_PATH/yangping/pretrained_model/$BERT_NAME/ + + +CHECKPOINT_PATH=/$ROOT_PATH/yangping/checkpoints/fengshen-finetune/$TASK/ +DEFAULT_ROOT_DIR=/cognitive_comp/yangping/nlp/fengshen/fengshen/scripts/log/$TASK/$BERT_NAME/nograd +OUTPUT_PATH=/$ROOT_PATH/yangping/nlp/modelevaluation/output/${TASK}_predict.json + + +config_json="./ds_config.$SLURM_JOBID.json" +# Deepspeed figures out GAS dynamically from dynamic GBS via set_train_batch_size() +# reduce_bucket_size: hidden_size*hidden_size +# stage3_prefetch_bucket_size: 0.9 * hidden_size * hidden_size +# stage3_param_persistence_threshold: 10 * hidden_size + +cat < $config_json +{ + "train_micro_batch_size_per_gpu": $BATCH_SIZE, + "steps_per_print": 100, + "gradient_clipping": 1.0, + "zero_optimization": { + "stage": 3, + "offload_optimizer": { + "device": "cpu", + "pin_memory": true + }, + "offload_param": { + "device": "cpu", + "pin_memory": true + }, + "overlap_comm": true, + "contiguous_gradients": true, + "sub_group_size": 1e9, + "reduce_bucket_size": 6553600, + "stage3_prefetch_bucket_size": 5898240, + "stage3_param_persistence_threshold": 25600, + "stage3_max_live_parameters": 1e9, + "stage3_max_reuse_distance": 1e9, + "stage3_gather_fp16_weights_on_model_save": true + }, + "optimizer": { + "type": "Adam", + "params": { + "lr": 1e-5, + "betas": [ + 0.9, + 0.95 + ], + "eps": 1e-8, + "weight_decay": 1e-2 + } + }, + "scheduler": { + "type": "WarmupLR", + "params":{ + "warmup_min_lr": 5e-8, + "warmup_max_lr": 1e-5, + "warmup_num_steps": 400, + "warmup_type": "linear" + } + }, + "zero_allow_untested_optimizer": false, + "fp16": { + "enabled": true, + "loss_scale": 0, + "loss_scale_window": 1000, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "activation_checkpointing": { + "partition_activations": false, + "contiguous_memory_optimization": false + }, + "wall_clock_breakdown": false +} +EOT + +export PL_DEEPSPEED_CONFIG_PATH=$config_json + + +DATA_ARGS="\ + --data_dir $DATA_DIR \ + --train_data train.json \ + --valid_data dev.json \ + --test_data test.json \ + --train_batchsize $BATCH_SIZE \ + --valid_batchsize $VAL_BATCH_SIZE \ + --max_length 128 \ + --texta_name $TEXTA_NAME \ + --label_name $LABEL_NAME \ + --id_name $ID_NAME \ + " + +MODEL_ARGS="\ + --learning_rate 0.00001 \ + --weight_decay 0.01 \ + --warmup 0.001 \ + --num_labels 15 \ + " + +MODEL_CHECKPOINT_ARGS="\ + --monitor val_acc \ + --save_top_k 3 \ + --mode max \ + --every_n_train_steps 200 \ + --save_weights_only True \ + --dirpath $CHECKPOINT_PATH \ + --filename model-{epoch:02d}-{val_acc:.4f} \ + " +TRAINER_ARGS="\ + --max_epochs 7 \ + --gpus 4 \ + --strategy deepspeed_stage_3 \ + --precision 16 \ + --gradient_clip_val 0.1 \ + --check_val_every_n_epoch 1 \ + --val_check_interval 100 \ + --default_root_dir $DEFAULT_ROOT_DIR \ + " + +options=" \ + --pretrained_model_path $PRETRAINED_MODEL_PATH \ + --output_save_path $OUTPUT_PATH \ + $DATA_ARGS \ + $MODEL_ARGS \ + $MODEL_CHECKPOINT_ARGS \ + $TRAINER_ARGS \ + " + +DOCKER_PATH=/$ROOT_PATH/yangping/containers/pytorch21_06_py3_docker_image.sif +SCRIPT_PATH=/$ROOT_PATH/yangping/nlp/fengshen/fengshen/examples/finetune_classification.py + +# python3 $SCRIPT_PATH $options +srun singularity exec 
--nv -B /cognitive_comp/:/cognitive_comp/ $DOCKER_PATH python3 $SCRIPT_PATH $options + diff --git a/fengshen/examples/classification/finetune_classification_bert-3.9B_wsc.sh b/fengshen/examples/classification/finetune_classification_bert-3.9B_wsc.sh new file mode 100644 index 0000000000000000000000000000000000000000..5d05662f1a2252de3bbd4fd9719ef8d3262d9c63 --- /dev/null +++ b/fengshen/examples/classification/finetune_classification_bert-3.9B_wsc.sh @@ -0,0 +1,158 @@ +#!/bin/bash +#SBATCH --job-name=slurm-test # create a short name for your job +#SBATCH --nodes=1 # node count +#SBATCH --ntasks=2 # total number of tasks across all nodes +#SBATCH --cpus-per-task=16 # cpu-cores per task (>1 if multi-threaded tasks) +#SBATCH --mem-per-cpu=8G # memory per cpu-core (4G is default) +#SBATCH --gres=gpu:2 # number of gpus per node +#SBATCH --mail-type=ALL # send email when job begins, ends or failed etc. + + +export TORCH_EXTENSIONS_DIR=/cognitive_comp/yangping/cache/torch_extendsions + +BERT_NAME=bert-3.9B + +TASK=wsc +TEXTA_NAME=texta +LABEL_NAME=label +ID_NAME=id + + +BATCH_SIZE=16 +VAL_BATCH_SIZE=56 +ZERO_STAGE=2 + + +ROOT_PATH=cognitive_comp +DATA_DIR=/cognitive_comp/yangping/data/unidata/multichoice/mrc_multichoice_data/other/cluewsc2020/ +PRETRAINED_MODEL_PATH=/$ROOT_PATH/yangping/pretrained_model/$BERT_NAME/ + + +CHECKPOINT_PATH=/$ROOT_PATH/yangping/checkpoints/fengshen-finetune/$TASK/ +DEFAULT_ROOT_DIR=/cognitive_comp/yangping/nlp/Fengshenbang-LM/fengshen/scripts/log/$TASK +OUTPUT_PATH=/$ROOT_PATH/yangping/nlp/modelevaluation/output/${TASK}_predict.json + + +config_json="./ds_config.$SLURM_JOBID.json" +# Deepspeed figures out GAS dynamically from dynamic GBS via set_train_batch_size() +# reduce_bucket_size: hidden_size*hidden_size +# stage3_prefetch_bucket_size: 0.9 * hidden_size * hidden_size +# stage3_param_persistence_threshold: 10 * hidden_size + +cat < $config_json +{ + "train_micro_batch_size_per_gpu": $BATCH_SIZE, + "steps_per_print": 100, + "gradient_clipping": 1.0, + "zero_optimization": { + "stage": 3, + "offload_optimizer": { + "device": "cpu", + "pin_memory": true + }, + "offload_param": { + "device": "cpu", + "pin_memory": true + }, + "overlap_comm": true, + "contiguous_gradients": true, + "sub_group_size": 1e9, + "reduce_bucket_size": 6553600, + "stage3_prefetch_bucket_size": 5898240, + "stage3_param_persistence_threshold": 25600, + "stage3_max_live_parameters": 1e9, + "stage3_max_reuse_distance": 1e9, + "stage3_gather_fp16_weights_on_model_save": true + }, + "optimizer": { + "type": "Adam", + "params": { + "lr": 1e-5, + "betas": [ + 0.9, + 0.95 + ], + "eps": 1e-8, + "weight_decay": 1e-2 + } + }, + "scheduler": { + "type": "WarmupLR", + "params":{ + "warmup_min_lr": 5e-6, + "warmup_max_lr": 1e-5 + } + }, + "zero_allow_untested_optimizer": false, + "fp16": { + "enabled": true, + "loss_scale": 0, + "loss_scale_window": 1000, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "activation_checkpointing": { + "partition_activations": false, + "contiguous_memory_optimization": false + }, + "wall_clock_breakdown": false +} +EOT + +export PL_DEEPSPEED_CONFIG_PATH=$config_json + + +DATA_ARGS="\ + --data_dir $DATA_DIR \ + --train_data train.json \ + --valid_data dev.json \ + --test_data test.json \ + --train_batchsize $BATCH_SIZE \ + --valid_batchsize $VAL_BATCH_SIZE \ + --max_length 128 \ + --texta_name $TEXTA_NAME \ + --label_name $LABEL_NAME \ + --id_name $ID_NAME \ + " + +MODEL_ARGS="\ + --learning_rate 0.00001 \ + --weight_decay 0.01 \ + --warmup 0.001 \ + --num_labels 2 \ + " + 
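The ZeRO-3 configs in these scripts rely on DeepSpeed's WarmupLR scheduler; the tnews config above, for instance, ramps from warmup_min_lr 5e-8 to warmup_max_lr 1e-5 over 400 steps with warmup_type "linear". A small worked example of that schedule, assuming the usual WarmupLR behaviour of a linear ramp followed by a constant learning rate:

```python
# Illustrative reading of the WarmupLR parameters used above (linear ramp, then flat).
warmup_min_lr, warmup_max_lr, warmup_num_steps = 5e-8, 1e-5, 400

def lr_at(step: int) -> float:
    if step >= warmup_num_steps:
        return warmup_max_lr
    return warmup_min_lr + (warmup_max_lr - warmup_min_lr) * step / warmup_num_steps

print(lr_at(0), lr_at(200), lr_at(400))     # 5e-08 5.025e-06 1e-05
```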
+MODEL_CHECKPOINT_ARGS="\ + --monitor val_acc \ + --save_top_k 3 \ + --mode max \ + --every_n_train_steps 10 \ + --save_weights_only True \ + --dirpath $CHECKPOINT_PATH \ + --filename model-{epoch:02d}-{val_acc:.4f} \ + " +TRAINER_ARGS="\ + --max_epochs 7 \ + --gpus 2 \ + --strategy deepspeed_stage_3 \ + --precision 16 \ + --check_val_every_n_epoch 1 \ + --val_check_interval 10 \ + --default_root_dir $DEFAULT_ROOT_DIR \ + " + +options=" \ + --pretrained_model_path $PRETRAINED_MODEL_PATH \ + --output_save_path $OUTPUT_PATH \ + $DATA_ARGS \ + $MODEL_ARGS \ + $MODEL_CHECKPOINT_ARGS \ + $TRAINER_ARGS \ + " + +DOCKER_PATH=/$ROOT_PATH/yangping/containers/pytorch21_06_py3_docker_image.sif +SCRIPT_PATH=/$ROOT_PATH/yangping/nlp/fengshen/fengshen/examples/finetune_classification.py + +# python3 $SCRIPT_PATH $options +srun singularity exec --nv -B /cognitive_comp/:/cognitive_comp/ $DOCKER_PATH python3 $SCRIPT_PATH $options + diff --git a/fengshen/examples/classification/finetune_classification_zen1-base_afqmc.sh b/fengshen/examples/classification/finetune_classification_zen1-base_afqmc.sh new file mode 100644 index 0000000000000000000000000000000000000000..845e93093cc6390db2c332c22e860ff88688a657 --- /dev/null +++ b/fengshen/examples/classification/finetune_classification_zen1-base_afqmc.sh @@ -0,0 +1,151 @@ +#!/bin/bash +#SBATCH --job-name=afqmc-bart-base # create a short name for your job +#SBATCH --nodes=1 # node count +#SBATCH --ntasks=2 # total number of tasks across all nodes +#SBATCH --cpus-per-task=30 # cpu-cores per task (>1 if multi-threaded tasks) +#SBATCH --gres=gpu:2 # number of gpus per node +#SBATCH --mail-type=ALL # send email when job begins, ends or failed etc. +#SBATCH -o %x-%j.log # output and error file name (%x=job name, %j=job id) + + +export TORCH_EXTENSIONS_DIR=/cognitive_comp/ganruyi/tmp/torch_extendsions + +MODEL_NAME=fengshen-zen1 + +TASK=afqmc +TEXTA_NAME=sentence1 +TEXTB_NAME=sentence2 +LABEL_NAME=label +ID_NAME=id + + +BATCH_SIZE=8 +VAL_BATCH_SIZE=32 +ZERO_STAGE=1 +STRATEGY=deepspeed_stage_${ZERO_STAGE} + +ROOT_DIR=/cognitive_comp/ganruyi/experiments/classification_finetune/${MODEL_NAME}_${TASK} +if [ ! -d ${ROOT_DIR} ];then + mkdir -p ${ROOT_DIR} + echo ${ROOT_DIR} created!!!!!!!!!!!!!! +else + echo ${ROOT_DIR} exist!!!!!!!!!!!!!!! 
+fi + +DATA_DIR=/cognitive_comp/yangping/data/ChineseCLUE_DATA/${TASK}_public/ +PRETRAINED_MODEL_PATH=/cognitive_comp/ganruyi/hf_models/zen/ZEN_pretrain_base_v0.1.0 + +CHECKPOINT_PATH=${ROOT_DIR}/ckpt/ +OUTPUT_PATH=${ROOT_DIR}/predict.json + + +config_json="${ROOT_DIR}/ds_config.json" +# Deepspeed figures out GAS dynamically from dynamic GBS via set_train_batch_size() +# reduce_bucket_size: hidden_size*hidden_size +# stage3_prefetch_bucket_size: 0.9 * hidden_size * hidden_size +# stage3_param_persistence_threshold: 10 * hidden_size + +cat < $config_json +{ + "train_micro_batch_size_per_gpu": $BATCH_SIZE, + "steps_per_print": 100, + "gradient_clipping": 0.1, + "zero_optimization": { + "stage": ${ZERO_STAGE} + }, + "optimizer": { + "type": "Adam", + "params": { + "lr": 1e-7, + "eps": 1e-12, + "weight_decay": 1e-2 + } + }, + "scheduler": { + "type": "WarmupLR", + "params":{ + "warmup_min_lr": 1e-5, + "warmup_max_lr": 1e-4, + "warmup_num_steps": 400, + "warmup_type": "linear" + } + }, + "zero_allow_untested_optimizer": false, + "fp16": { + "enabled": false, + "loss_scale": 0, + "loss_scale_window": 1000, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "activation_checkpointing": { + "partition_activations": false, + "contiguous_memory_optimization": false + }, + "wall_clock_breakdown": false +} +EOT + +export PL_DEEPSPEED_CONFIG_PATH=$config_json + + +DATA_ARGS="\ + --data_dir $DATA_DIR \ + --train_data train.json \ + --valid_data dev.json \ + --test_data test.json \ + --train_batchsize $BATCH_SIZE \ + --valid_batchsize $VAL_BATCH_SIZE \ + --max_length 64 \ + --texta_name $TEXTA_NAME \ + --textb_name $TEXTB_NAME \ + --label_name $LABEL_NAME \ + --id_name $ID_NAME \ + " + +MODEL_ARGS="\ + --learning_rate 1e-5 \ + --weight_decay 1e-2 \ + --warmup 0.01 \ + --num_labels 2 \ + " + +MODEL_CHECKPOINT_ARGS="\ + --monitor val_acc \ + --save_top_k 3 \ + --mode max \ + --every_n_train_steps 200 \ + --save_weights_only True \ + --dirpath $CHECKPOINT_PATH \ + --filename model-{epoch:02d}-{val_acc:.4f} \ + " + + +TRAINER_ARGS="\ + --max_epochs 10 \ + --gpus 1 \ + --num_nodes 1 \ + --strategy $STRATEGY \ + --gradient_clip_val 1.0 \ + --check_val_every_n_epoch 1 \ + --val_check_interval 1.0 \ + --default_root_dir $ROOT_DIR \ + " + +options=" \ + --pretrained_model_path $PRETRAINED_MODEL_PATH \ + --output_save_path $OUTPUT_PATH \ + $DATA_ARGS \ + $MODEL_ARGS \ + $MODEL_CHECKPOINT_ARGS \ + $TRAINER_ARGS \ + " + +SINGULARITY_PATH=/cognitive_comp/ganruyi/pytorch21_06_py3_docker_image_v2.sif +SCRIPT_PATH=/cognitive_comp/ganruyi/Fengshenbang-LM/fengshen/examples/classification/finetune_classification.py + +# python3 $SCRIPT_PATH $options +source activate base +# srun singularity exec --nv -B /cognitive_comp/:/cognitive_comp/ $SINGULARITY_PATH /home/ganruyi/anaconda3/bin/python $SCRIPT_PATH $options +/home/ganruyi/anaconda3/bin/python $SCRIPT_PATH $options + diff --git a/fengshen/examples/classification/finetune_classification_zen1-base_tnews.sh b/fengshen/examples/classification/finetune_classification_zen1-base_tnews.sh new file mode 100644 index 0000000000000000000000000000000000000000..eaa50ddac4376c8e86000852da138d0d4779126d --- /dev/null +++ b/fengshen/examples/classification/finetune_classification_zen1-base_tnews.sh @@ -0,0 +1,150 @@ +#!/bin/bash +#SBATCH --job-name=afqmc-bart-base # create a short name for your job +#SBATCH --nodes=1 # node count +#SBATCH --ntasks=2 # total number of tasks across all nodes +#SBATCH --cpus-per-task=30 # cpu-cores per task (>1 if multi-threaded tasks) +#SBATCH --gres=gpu:2 # 
number of gpus per node +#SBATCH --mail-type=ALL # send email when job begins, ends or failed etc. +#SBATCH -o %x-%j.log # output and error file name (%x=job name, %j=job id) + +export CUDA_VISIBLE_DEVICES='5' +export TORCH_EXTENSIONS_DIR=/cognitive_comp/ganruyi/tmp/torch_extendsions + +MODEL_NAME=fengshen-zen1 + +TASK=tnews +TEXTA_NAME=sentence +LABEL_NAME=label +ID_NAME=id + + +BATCH_SIZE=8 +VAL_BATCH_SIZE=32 +ZERO_STAGE=1 +STRATEGY=deepspeed_stage_${ZERO_STAGE} + +ROOT_DIR=/cognitive_comp/ganruyi/experiments/classification_finetune/${MODEL_NAME}_${TASK} +if [ ! -d ${ROOT_DIR} ];then + mkdir -p ${ROOT_DIR} + echo ${ROOT_DIR} created!!!!!!!!!!!!!! +else + echo ${ROOT_DIR} exist!!!!!!!!!!!!!!! +fi + +DATA_DIR=/cognitive_comp/yangping/data/ChineseCLUE_DATA/${TASK}_public/ +PRETRAINED_MODEL_PATH=/cognitive_comp/ganruyi/hf_models/zen/ZEN_pretrain_base_v0.1.0 + +CHECKPOINT_PATH=${ROOT_DIR}/ckpt/ +OUTPUT_PATH=${ROOT_DIR}/predict.json + + +config_json="${ROOT_DIR}/ds_config.json" +# Deepspeed figures out GAS dynamically from dynamic GBS via set_train_batch_size() +# reduce_bucket_size: hidden_size*hidden_size +# stage3_prefetch_bucket_size: 0.9 * hidden_size * hidden_size +# stage3_param_persistence_threshold: 10 * hidden_size + +cat < $config_json +{ + "train_micro_batch_size_per_gpu": $BATCH_SIZE, + "steps_per_print": 100, + "gradient_clipping": 0.1, + "zero_optimization": { + "stage": ${ZERO_STAGE} + }, + "optimizer": { + "type": "Adam", + "params": { + "lr": 2e-5, + "eps": 1e-12, + "weight_decay": 1e-2 + } + }, + "scheduler": { + "type": "WarmupLR", + "params":{ + "warmup_min_lr": 2e-8, + "warmup_max_lr": 2e-5, + "warmup_num_steps": 400, + "warmup_type": "linear" + } + }, + "zero_allow_untested_optimizer": false, + "fp16": { + "enabled": true, + "loss_scale": 0, + "loss_scale_window": 1000, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "activation_checkpointing": { + "partition_activations": false, + "contiguous_memory_optimization": false + }, + "wall_clock_breakdown": false +} +EOT + +export PL_DEEPSPEED_CONFIG_PATH=$config_json + + +DATA_ARGS="\ + --data_dir $DATA_DIR \ + --train_data train.json \ + --valid_data dev.json \ + --test_data test1.1.json \ + --train_batchsize $BATCH_SIZE \ + --valid_batchsize $VAL_BATCH_SIZE \ + --max_length 128 \ + --texta_name $TEXTA_NAME \ + --label_name $LABEL_NAME \ + --id_name $ID_NAME \ + " + +MODEL_ARGS="\ + --learning_rate 1e-5 \ + --weight_decay 1e-2 \ + --warmup 0.01 \ + --num_labels 15 \ + " + +MODEL_CHECKPOINT_ARGS="\ + --monitor val_acc \ + --save_top_k 3 \ + --mode max \ + --every_n_train_steps 200 \ + --save_weights_only True \ + --dirpath $CHECKPOINT_PATH \ + --filename model-{epoch:02d}-{val_acc:.4f} \ + " + + +TRAINER_ARGS="\ + --max_epochs 7 \ + --gpus 1 \ + --num_nodes 1 \ + --strategy $STRATEGY \ + --gradient_clip_val 1.0 \ + --check_val_every_n_epoch 1 \ + --val_check_interval 1.0 \ + --default_root_dir $ROOT_DIR \ + " + +options=" \ + --pretrained_model_path $PRETRAINED_MODEL_PATH \ + --output_save_path $OUTPUT_PATH \ + --model_type $MODEL_NAME \ + $DATA_ARGS \ + $MODEL_ARGS \ + $MODEL_CHECKPOINT_ARGS \ + $TRAINER_ARGS \ + " + +SINGULARITY_PATH=/cognitive_comp/ganruyi/pytorch21_06_py3_docker_image_v2.sif +SCRIPT_PATH=/cognitive_comp/ganruyi/Fengshenbang-LM/fengshen/examples/classification/finetune_classification.py + +# python3 $SCRIPT_PATH $options +source activate base +singularity exec --nv -B /cognitive_comp/:/cognitive_comp/ $SINGULARITY_PATH /home/ganruyi/anaconda3/bin/python $SCRIPT_PATH $options +# 
/home/ganruyi/anaconda3/bin/python $SCRIPT_PATH $options + diff --git a/fengshen/examples/classification/readme.md b/fengshen/examples/classification/readme.md new file mode 100644 index 0000000000000000000000000000000000000000..b90ce5a946acf55a6530b3c8d010a5ec2642f6ae --- /dev/null +++ b/fengshen/examples/classification/readme.md @@ -0,0 +1,23 @@ +## 分类下游任务 + +在当前目录下,我们提供丰富的分类任务的示例,其中我们提供三个一键式运行的示例。 + +- demo_classification_afqmc_roberta.sh 使用DDP微调roberta +- demo_classification_afqmc_roberta_deepspeed.sh 结合deepspeed微调roberta,获得更快的运算速度 +- demo_classification_afqmc_erlangshen_offload.sh 仅需7G显存即可微调我们效果最好的二郎神系列模型 + +上述示例均采用AFQMC的数据集,关于数据集的介绍可以在[这里](https://www.cluebenchmarks.com/introduce.html)找到。 +同时我们处理过的数据文件已经放在Huggingface上,点击[这里](https://huggingface.co./datasets/IDEA-CCNL/AFQMC)直达源文件。 +仅需要按我们的格式稍微处理一下数据集,即可适配下游不同的分类任务。 +在脚本示例中,仅需要修改如下参数即可适配本地文件 +``` + --dataset_name IDEA-CCNL/AFQMC \ + +-------> 修改为 + + --data_dir $DATA_DIR \ # 数据目录 + --train_data train.json \ # 数据文件 + --valid_data dev.json \ + --test_data test.json \ + +``` \ No newline at end of file diff --git a/fengshen/examples/clip_finetune/clip_finetune_flickr.py b/fengshen/examples/clip_finetune/clip_finetune_flickr.py new file mode 100644 index 0000000000000000000000000000000000000000..9cac74d87e861cf0ffff64c9ca03330208db90c3 --- /dev/null +++ b/fengshen/examples/clip_finetune/clip_finetune_flickr.py @@ -0,0 +1,259 @@ +import sys +sys.path.append('../../') +from data.clip_dataloader.flickr import FlickrDataModule +import pytorch_lightning as pl +import numpy as np +import torch +from torch.optim.lr_scheduler import CosineAnnealingWarmRestarts +import torch.nn.functional as F +import math +import copy +import argparse +from transformers import CLIPModel, BertForSequenceClassification + +class CLIPLightning(pl.LightningModule): + def __init__(self, model_name='ViT-B/32', minibatch_size=2): + """A lightning wrapper for a CLIP model as specified in the paper. + + Args: + model_name (str): A case sensitive visual model name. + config (dict): A dictionary containing the CLIP instantiation parameters. 
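+ minibatch_size (int): chunk size used to split each training batch when image/text features are recomputed for the contrastive loss.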
+ """ + super().__init__() + + self.prepare_data_per_node = True + self.model_name = 'ViT-B/32' + # self.model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32") + self.clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32") # NOTE load from openAI + self.text_encoder = BertForSequenceClassification.from_pretrained("IDEA-CCNL/Taiyi-CLIP-Roberta-102M-Chinese") + self.minibatch_size = minibatch_size + self.isViT = 'ViT' in self.model_name + self.automatic_optimization = False + + # Training loss: https://github.com/openai/CLIP/issues/83 + # Mini-batching thanks to https://github.com/crowsonkb / https://twitter.com/RiversHaveWings + # Multi-GPU support: https://github.com/MicPie/clasp + + def training_step(self, train_batch, idx): + # get optimizers and scheduler + optimizer = self.optimizers() + + image, text, labels = train_batch + n = math.ceil(len(image) // self.minibatch_size) + image_mbs = torch.chunk(image, n) + text_mbs = torch.chunk(text, n) + + with torch.no_grad(): + ims = [F.normalize(self.clip_model.get_image_features(im), dim=1) for im in image_mbs] + txt = [F.normalize(self.text_encoder(t).logits, dim=1) for t in text_mbs] + # gather from all GPUs 这里的LOSS要把所有GPU的汇集起来一起算才对 + ims = self.all_gather(torch.cat(ims)) + txt = self.all_gather(torch.cat(txt)) + + if len(ims.shape) == 3: + ims = list(ims) + txt = list(txt) + else: + ims = [ims] + txt = [txt] + + image_logits = torch.cat(ims) @ torch.cat(txt).t() * self.clip_model.logit_scale.exp() + ground_truth = torch.arange(len(image_logits)).long().to(image_logits.device) + loss = (F.cross_entropy(image_logits, ground_truth) + + F.cross_entropy(image_logits.t(), ground_truth)).div(2) + acc_i = (torch.argmax(image_logits, 1) == ground_truth).sum() + acc_t = (torch.argmax(image_logits, 0) == ground_truth).sum() + self.log_dict({'loss': loss / len(ims), 'acc': (acc_i + acc_t) / 2 / len(image) / len(ims)}, prog_bar=True) + + if isinstance(optimizer, list): + optimizer = optimizer[0] + optimizer.zero_grad() + + # image loss + for j, mb in enumerate(image_mbs[:-1]): + # 最后一部分样本舍弃。(对齐的bug) + images_tmp = copy.deepcopy(ims) + images_tmp[self.global_rank][j * self.minibatch_size:(j+1)*self.minibatch_size] = \ + F.normalize(self.clip_model.get_image_features(mb), dim=1) + image_logits = torch.cat(images_tmp) @ torch.cat(txt).t() * self.clip_model.logit_scale.exp() + ground_truth = torch.arange(len(image_logits)).long().to(image_logits.device) + loss = (F.cross_entropy(image_logits, ground_truth) + F.cross_entropy(image_logits.t(), ground_truth))/2 + self.manual_backward(loss) + + # text loss + for j, mb in enumerate(text_mbs[:-1]): + text_tmp = copy.deepcopy(txt) + text_tmp[self.global_rank][j * self.minibatch_size:(j+1)*self.minibatch_size] = \ + F.normalize(self.text_encoder(mb).logits, dim=1) + image_logits = torch.cat(ims) @ torch.cat(text_tmp).t() * self.clip_model.logit_scale.exp() + loss = (F.cross_entropy(image_logits, ground_truth) + F.cross_entropy(image_logits.t(), ground_truth))/2 + self.manual_backward(loss) + + optimizer.step() + lr_scheduler = self.lr_schedulers() + lr_scheduler.step() + self.clip_model.logit_scale.data.clamp_(-np.log(100), np.log(100)) + + def validation_step(self, val_batch, idx): + image, text, labels = val_batch + img_embed = self.clip_model.get_image_features(image) + txt_embed = self.text_encoder(text).logits + # print(img_embed.shape) + image_norm = F.normalize(img_embed, dim=1) + text_norm = F.normalize(txt_embed, dim=1) + image_logits = image_norm @ text_norm.t() * 
self.clip_model.logit_scale.exp() + text_logits = text_norm @ image_norm.t() * self.clip_model.logit_scale.exp() + # print(image_logits.shape) + # image_logits, text_logits = self.forward(image, text) + ground_truth = torch.arange(len(image_logits)).long().to(image_logits.device) + loss = (F.cross_entropy(image_logits, ground_truth) + F.cross_entropy(text_logits, ground_truth)).div(2) + self.log('val_loss', loss, prog_bar=True) + return [image_norm, text_norm, labels] + + def validation_epoch_end(self, outputs): + image_features = torch.cat([x[0] for x in outputs]) + text_features = torch.cat([x[1] for x in outputs]) + labels = [label for x in outputs for label in x[2]] + print(image_features.shape, text_features.shape, len(labels)) + self.get_metrics(image_features, text_features, labels, 100) + + def test_step(self, test_batch, idx): + image, text, labels = test_batch + image_features = self.clip_model.get_image_features(image) + text_features = self.text_encoder(text).logits + image_features = image_features / image_features.norm(dim=1, keepdim=True) + text_features = text_features / text_features.norm(dim=1, keepdim=True) + return [image_features, text_features, labels] + + def test_epoch_end(self, outputs): + image_features = torch.cat([x[0] for x in outputs]) + text_features = torch.cat([x[1] for x in outputs]) + labels = [label for x in outputs for label in x[2]] + print(image_features.shape, text_features.shape, len(labels)) + self.get_metrics(image_features, text_features, labels, 100) + + def get_metrics(self, image_features, text_features, labels, logit_scale): + # 计算相似度,支持多个样本的情况(比如一个图片有多个caption) + # img2txt计算的时候要用到,因为一张图片可能对应多个文本。 + # txt2img计算的时候不需要(一般一个text只有一个对应图片) + # metrics = {} + logits_per_image = (logit_scale * image_features @ text_features.t()).detach().cpu() + logits_per_text = logits_per_image.t().detach().cpu() + + logits = {"image_to_text": logits_per_image, "text_to_image": logits_per_text} + + label2idx = {} # 计算label到idx的映射。 + repeat_id = [] + for i, label in enumerate(labels): + if label not in label2idx: + label2idx[label] = [i] + else: + # 表示该index的标签出现过,记录这个index,后续算txt2img分数的时候,这些index的权值要降低。 + label2idx[label].append(i) + repeat_id.append(i) + # print(label2idx) # 标注了每个label的idx + + # print('repeat_id:', repeat_id) + ground_truth = [label2idx[label] for label in labels] + # print(ground_truth) + + for name, logit in logits.items(): + # print(name, logit.shape) + if name == 'text_to_image': + logit[:, repeat_id] -= 1e8 # 这部分的分数要降低。(重复出现的图片,直接忽略) + r1_stat, r5_stat, r10_stat = [], [], [] + ranking = torch.argsort(logit, descending=True) # index of the largest element to the smallest + # print(name, ranking[:, :10]) + for i, each_query in enumerate(ranking[:, :10]): + for j, q in enumerate(each_query): + if q in ground_truth[i]: + if j == 0: + r1_stat.append(1) + r5_stat.append(1) + r10_stat.append(1) + break + if j < 5: + r5_stat.append(1) + r10_stat.append(1) + break + if j < 10: + r10_stat.append(1) + break + print(f'{name} r1:{sum(r1_stat)/len(logit)}, r5:{sum(r5_stat)/len(logit)}, r10:{sum(r10_stat)/len(logit)}') + + def configure_optimizers(self): + lr = { + "RN50": 5e-4, + "RN101": 5e-4, + "RN50x4": 5e-4, + "RN50x16": 4e-4, + "RN50x64": 3.6e-4, + "ViT-B/32": 5e-4, + "ViT-B/16": 5e-4, + "ViT-L/14": 4e-4, + "ViT-L/14-336px": 2e-5 + }[self.model_name] + + optimizer = torch.optim.AdamW( + [{'params': self.clip_model.parameters()}, {'params': self.text_encoder.parameters()}], + lr=lr, + betas=( + 0.9, + 0.98 if self.isViT else 0.999 + ), + eps=1e-6 if 
self.isViT else 1e-8, + weight_decay=0.2 + ) + + # Source: https://github.com/openai/CLIP/issues/107 + # Use pip install 'git+https://github.com/katsura-jp/pytorch-cosine-annealing-with-warmup' + lr_scheduler = CosineAnnealingWarmRestarts( + optimizer, + T_0=2000 + ) + # CosineAnnealingWarmupRestarts + return {'optimizer': optimizer, 'lr_scheduler': lr_scheduler} + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + + # model_name + parser.add_argument('--model', type=str, + default="ViT-B/32", + help='model definition') + + # experiment setting + parser.add_argument('--batch_size', type=int, default=128) + parser.add_argument('--num_epoches', type=int, default=1) + parser.add_argument('--num_gpus', type=int, default=2) + + # dataset + parser.add_argument('--train_filename', type=str, + help='dir or csv file') + parser.add_argument('--train_root', type=str, + help='image root path') + parser.add_argument('--val_filename', type=str, + help='dir or csv file') + parser.add_argument('--val_root', type=str, + help='image root path') + parser.add_argument('--test_filename', type=str, + help='dir or csv file') + parser.add_argument('--test_root', type=str, + help='image root path') + parser.add_argument('--num_workers', type=int, default=0) + + # huggingface pretrain model 定义 + parser.add_argument('--pretrain_model', type=str, + default="openai/clip-vit-base-patch32", + help='defalut load from openai') # "wf-genius/TaiYi-CLIP-ViT-B-32" 是我训好的 NOTE + + args = parser.parse_args() + dm = FlickrDataModule(args) + + model = CLIPLightning(model_name=args.model, minibatch_size=args.batch_size//2) + trainer = pl.Trainer(gpus=args.num_gpus, precision=16, max_epochs=args.num_epoches) + trainer.test(model, dm) # zero-shot test + trainer.fit(model, dm) # finetune on train set + trainer.test(model, dm) # test again + diff --git a/fengshen/examples/clip_finetune/finetune_flickr.sh b/fengshen/examples/clip_finetune/finetune_flickr.sh new file mode 100644 index 0000000000000000000000000000000000000000..0e8f8c79decdbd4a070188fbfa976bd4b90d0d8d --- /dev/null +++ b/fengshen/examples/clip_finetune/finetune_flickr.sh @@ -0,0 +1,10 @@ +python clip_finetune_flickr.py --batch_size 512 \ +--num_gpus 1 \ +--num_workers 20 \ +--train_filename /shared_space/ccnl/mm_data/Flickr30k-CNA/train/flickr30k_cna_train.txt \ +--val_filename /shared_space/ccnl/mm_data/Flickr30k-CNA/val/flickr30k_cna_val.txt \ +--test_filename /shared_space/ccnl/mm_data/Flickr30k-CNA/test/flickr30k_cn_test.txt \ +--train_root /shared_space/ccnl/mm_data/Flickr30k-CNA/flickr30k/images \ +--val_root /shared_space/ccnl/mm_data/Flickr30k-CNA/flickr30k/images \ +--test_root /shared_space/ccnl/mm_data/Flickr30k-CNA/flickr30k/images \ + diff --git a/fengshen/examples/clue1.1/README.md b/fengshen/examples/clue1.1/README.md new file mode 100644 index 0000000000000000000000000000000000000000..63856c5a596db8f968a7dcebcc03d85ff8c3a49f --- /dev/null +++ b/fengshen/examples/clue1.1/README.md @@ -0,0 +1,48 @@ +# 中文 NLP 权威测评基准 CLUE 刷榜 Top10 方案指南 + + [CLUE](https://www.cluebenchmarks.com) 是中文 NLP 的权威测评榜单,也吸引了许多国内许多团队在上面进行测评。在我们的最新模型 UniMC 中,也使用 CLUE 对我们的模型进行了测评。在全量数据榜单 CLUE1.1 中,我们的 [UniMC-DeBERTa-1.4B](https://huggingface.co./IDEA-CCNL/Erlangshen-UniMC-DeBERTa-v2-1.4B-Chinese) 模型取得了第 8 的成绩,是 [CLUE1.1](https://www.cluebenchmarks.com/rank.html) 排行榜(2022年11月14日)前 10 名中唯一开源模型权重和刷榜代码的模型。 + +## 刷榜方案 + +通过观察可以发现,在CLUE需要测评的 9 个任务中,有 8 个是分类任务,只有一个 cmrc2018 是抽取式的阅读理解任务。因此,结合我们的 Fengshenbang-LM 已有的模型,我们可以使用 
[UniMC](https://github.com/IDEA-CCNL/Fengshenbang-LM/tree/dev/yangping/fengshen/examples/unimc) 来实现 8 个是分类任务,用 [Ubert](https://github.com/IDEA-CCNL/Fengshenbang-LM/tree/dev/yangping/fengshen/examples/ubert) 来实现 cmrc2018 任务,详细的方案可以看我们的知乎文章:https://zhuanlan.zhihu.com/p/583679722 + +## 项目要求 + +安装我们的 fengshen 框架,我们暂且提供如下方式安装 +```shell +git clone https://github.com/IDEA-CCNL/Fengshenbang-LM.git +cd Fengshenbang-LM +pip install --editable ./ +``` +## 运行项目 + +### 数据下载 +由于 HuggingFace 上的数据与最终提交的数据 id 有可能对应不上,所以建议还是去官方仓库进行下载 +https://github.com/CLUEBENCHMARK/CLUE + + +### 数据预处理 +将数据下载之后,修改下面脚本的路径,运行下面脚本将数据处理成 UniMC 模型 和 Ubert 模型所需要的格式 +```shell +sh cluedata2unidata.sh +``` + +### 模型训练 +训练CLUE上的8个分类任务,一些训练参数可根据自己的设备进行修改。对于全量数据来说,训练超参数没有那么大的影响 +```shell +sh run_clue_unimc.sh +``` +训练 cmrc2018 任务,一些训练参数可根据自己的设备进行修改 +```shell +sh run_clue_ubert.sh +``` + +### 预测结果提交 + +运行下面脚本将预测结果转化为 CLUE 要求的格式,数据路径需要根据自己的路径修改调整。运行下面脚本就可以得到结果,然后拿到 [CLUE](https://www.cluebenchmarks.com/index.html) 官网上去提交了 + +```shell +sh predict2submit.sh +``` + + diff --git a/fengshen/examples/clue1.1/cluedata2unidata.sh b/fengshen/examples/clue1.1/cluedata2unidata.sh new file mode 100644 index 0000000000000000000000000000000000000000..d838604a8b3d39ab90b277f2a467d6d087e7bd54 --- /dev/null +++ b/fengshen/examples/clue1.1/cluedata2unidata.sh @@ -0,0 +1,15 @@ + +CLUEDATA_PATH=./CLUE_DATA #CLUE 原始数据路径 +UNIDATA_PATH=./data #处理数据输出路 + +SCRIPT_PATH=./data_preprocessing + +python $SCRIPT_PATH/afqmc_preprocessing.py --data_path=$CLUEDATA_PATH/afqmc_public --save_path=$UNIDATA_PATH/afqmc +python $SCRIPT_PATH/c3_preprocessing.py --data_path=$CLUEDATA_PATH/c3_public --save_path=$UNIDATA_PATH/c3 +python $SCRIPT_PATH/chid_preprocessing.py --data_path=$CLUEDATA_PATH/chid_public --save_path=$UNIDATA_PATH/chid +python $SCRIPT_PATH/csl_preprocessing.py --data_path=$CLUEDATA_PATH/csl_public --save_path=$UNIDATA_PATH/csl +python $SCRIPT_PATH/iflytek_preprocessing.py --data_path=$CLUEDATA_PATH/iflytek_public --save_path=$UNIDATA_PATH/iflytek +python $SCRIPT_PATH/ocnli_preprocessing.py --data_path=$CLUEDATA_PATH/ocnli_public --save_path=$UNIDATA_PATH/ocnli +python $SCRIPT_PATH/tnews_preprocessing.py --data_path=$CLUEDATA_PATH/tnews_public --save_path=$UNIDATA_PATH/tnews +python $SCRIPT_PATH/wsc_preprocessing.py --data_path=$CLUEDATA_PATH/cluewsc2020_public --save_path=$UNIDATA_PATH/wsc +python $SCRIPT_PATH/cmrc2018_preprocessing.py --data_path=$CLUEDATA_PATH/cmrc2018_public --save_path=$UNIDATA_PATH/cmrc2018 diff --git a/fengshen/examples/clue1.1/data_preprocessing/afqmc_preprocessing.py b/fengshen/examples/clue1.1/data_preprocessing/afqmc_preprocessing.py new file mode 100644 index 0000000000000000000000000000000000000000..9297199bc6f0e0972ec508876680a321ee8a4165 --- /dev/null +++ b/fengshen/examples/clue1.1/data_preprocessing/afqmc_preprocessing.py @@ -0,0 +1,59 @@ +import json +from tqdm import tqdm +import os +import argparse + +label2desc={"0": "不相似", "1": "相似"} + +def load_data(file_path,is_training=False): + with open(file_path, 'r', encoding='utf8') as f: + lines = f.readlines() + result=[] + for line in tqdm(lines): + data = json.loads(line) + texta = data['sentence1'] + textb = data['sentence2'] + question = '' + choice = [v for k,v in label2desc.items()] + answer = label2desc[data['label']] if 'label' in data.keys() else '' + label = choice.index(answer) if 'label' in data.keys() else 0 + text_id = data['id'] if 'id' in data.keys() else 0 + result.append({ + 'task_type':'语义匹配', + 'texta':texta, + 'textb':textb, + 'question':question, + 
'choice':choice, + 'answer':answer, + 'label':label, + 'id':text_id}) + return result + + +def save_data(data,file_path): + with open(file_path, 'w', encoding='utf8') as f: + for line in data: + json_data=json.dumps(line,ensure_ascii=False) + f.write(json_data+'\n') + + +if __name__=="__main__": + + parser = argparse.ArgumentParser(description="train") + parser.add_argument("--data_path", type=str,default="") + parser.add_argument("--save_path", type=str,default="") + + args = parser.parse_args() + + + data_path = args.data_path + save_path = args.save_path + + if not os.path.exists(save_path): + os.makedirs(save_path) + + file_list = ['train','dev','test'] + for file in file_list: + file_path = os.path.join(data_path,file+'.json') + output_path = os.path.join(save_path,file+'.json') + save_data(load_data(file_path),output_path) diff --git a/fengshen/examples/clue1.1/data_preprocessing/c3_preprocessing.py b/fengshen/examples/clue1.1/data_preprocessing/c3_preprocessing.py new file mode 100644 index 0000000000000000000000000000000000000000..46f8c268cf829384cc05d2b4c3c01e826d1ad892 --- /dev/null +++ b/fengshen/examples/clue1.1/data_preprocessing/c3_preprocessing.py @@ -0,0 +1,72 @@ +import json +from tqdm import tqdm +import os +import argparse + + +def load_data(file_path,is_training=False): + with open(file_path, 'r', encoding='utf8') as f: + lines = json.loads(''.join(f.readlines())) + result=[] + for line in tqdm(lines): + data = line + texta = '\n'.join(data[0]) + textb ='' + for qa in data[1]: + question=qa['question'] + choice=qa['choice'] + answer=qa['answer'] if 'answer' in qa.keys() else '' + label = qa['choice'].index(answer) if 'answer' in qa.keys() else 0 + text_id = qa['id'] if 'id' in qa.keys() else 0 + result.append({'texta':texta, + 'textb':textb, + 'question':question, + 'choice':choice, + 'answer':answer, + 'label':label, + 'id':text_id}) + return result + + +def save_data(data,file_path): + with open(file_path, 'w', encoding='utf8') as f: + for line in data: + json_data=json.dumps(line,ensure_ascii=False) + f.write(json_data+'\n') + + + + +if __name__=="__main__": + parser = argparse.ArgumentParser(description="train") + parser.add_argument("--data_path", type=str,default="") + parser.add_argument("--save_path", type=str,default="") + + args = parser.parse_args() + + + data_path = args.data_path + save_path = args.save_path + + if not os.path.exists(save_path): + os.makedirs(save_path) + + file_list=['d-train','d-dev','c3-m-train','m-train','m-dev','test1.0','test1.1'] + train_data = [] + dev_data = [] + for file in file_list: + file_path = os.path.join(data_path,file+'.json') + data=load_data(file_path=file_path) + if 'train' in file or 'd-dev' in file: + train_data.extend(data) + elif 'm-dev' in file: + dev_data.extend(data) + elif 'test' in file: + output_path = os.path.join(save_path,file+'.json') + save_data(data,output_path) + + output_path = os.path.join(save_path,'train.json') + save_data(train_data,output_path) + + output_path = os.path.join(save_path,'dev.json') + save_data(dev_data,output_path) \ No newline at end of file diff --git a/fengshen/examples/clue1.1/data_preprocessing/chid_preprocessing.py b/fengshen/examples/clue1.1/data_preprocessing/chid_preprocessing.py new file mode 100644 index 0000000000000000000000000000000000000000..e55aaf9b1c4ceed02343c5417aa205e570fef26c --- /dev/null +++ b/fengshen/examples/clue1.1/data_preprocessing/chid_preprocessing.py @@ -0,0 +1,159 @@ +import json +from tqdm import tqdm +import os +import re +import argparse + 
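+# ChID idiom-cloze preprocessing: for every `#idiom\d{6}#` placeholder in a passage,
+# the surrounding placeholders are filled in with their gold idioms (looked up in the
+# train/dev answer files), one candidate sentence is built per idiom in `candidates`,
+# and overly long contexts are trimmed so that text plus choices stay within 512 characters.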
+mask_token='[MASK]' +label_mask='__' + + +def load_schema(train_answer,dev_answer): + with open(train_answer,'r',encoding='utf-8') as f: + train2id = json.loads(''.join(f.readlines())) + + with open(dev_answer,'r',encoding='utf-8') as f: + dev2id = json.loads(''.join(f.readlines())) + for k,v in dev2id.items(): + train2id[k]=v + + return train2id + + +def cut(sentence): + """ + 将一段文本切分成多个句子 + :param sentence: ['虽然BillRoper正忙于全新游戏 + :return: ['虽然BillRoper正..接近。' , '与父母,之首。' , '很多..常见。' , '”一位上..推进。' , ''”一直坚..市场。'' , '如今,...的70%。'] + """ + new_sentence = [] + sen = [] + for i in sentence: # 虽 + sen.append(i) + if i in ['。', '!', '?', '?',',',',']: + new_sentence.append("".join(sen)) #['虽然BillRoper正...接近。' , '与父母,...之首。' , ] + sen = [] + + if len(new_sentence) <= 1: # 一句话超过max_seq_length且没有句号的,用","分割,再长的不考虑了。 + new_sentence = [] + sen = [] + for i in sentence: + sen.append(i) + if i.split(' ')[0] in [',', ','] and len(sen) != 0: + new_sentence.append("".join(sen)) + sen = [] + + if len(sen) > 0: # 若最后一句话无结尾标点,则加入这句话 + new_sentence.append("".join(sen)) + return new_sentence + + +def get_answer_text(text,m): + sent_list=cut(text) + text1='' + text2='' + for i,sent in enumerate(sent_list): + if m in sent: + text1=''.join(sent_list[:i]) + if i+1>len(sent_list)-1: + text2='' + else: + text2=''.join(sent_list[i+1:]) + index_text=sent + return text1,text2,index_text + return '','','' + + + +def load_data(file_path,label2id): + with open(file_path, 'r', encoding='utf8') as f: + lines = f.readlines() + result=[] + for l,line in tqdm(enumerate(lines)): + data = json.loads(line) + choice=data['candidates'] + for s,sent in enumerate(data['content']): + masks=re.findall("#idiom\d{6}#", sent) + for m in masks: + text1,text2,index_text=get_answer_text(sent,m) + + masks1=re.findall("#idiom\d{6}#", text1) + for m1 in masks1: + text1=text1.replace(m1,choice[label2id[m1]]) + + masks2=re.findall("#idiom\d{6}#", text2) + for m2 in masks2: + text2=text2.replace(m2,choice[label2id[m2]]) + + masks3=re.findall("#idiom\d{6}#", index_text) + for m3 in masks3: + if m3!=m: + index_text=index_text.replace(m3,choice[label2id[m3]]) + + choice=[] + for c in data['candidates']: + choice.append(index_text.replace(m,c)) + + if len('.'.join(choice))>400: + choice=data['candidates'] + text1=text1+index_text.split(m)[0] + text2=index_text.split(m)[1]+text2 + + if len(text1)+len(text2)>512-len('.'.join(choice)): + split1=0 + split2=0 + while split1+split2<512-len('.'.join(choice)): + if split1= len(context): + return results + n += 1 + + +def load_data(file_path,is_training=False): + task_type='抽取任务' + subtask_type='抽取式阅读理解' + with open(file_path, 'r', encoding='utf8') as f: + lines = json.loads(''.join(f.readlines())) + result=[] + lines = lines['data'] + for line in tqdm(lines): + if line['paragraphs']==[]: + continue + data = line['paragraphs'][0] + context=data['context'].strip() + for qa in data['qas']: + question=qa['question'].strip() + rcv=[] + for a in qa['answers']: + if a not in rcv: + rcv.append(a) + split=stride_split(question, context, a['text'], a['answer_start']) + for sp in split: + choices = [] + + choice = {} + choice['id']=qa['id'] + choice['entity_type'] = qa['question'] + choice['label']=0 + entity_list=[] + if sp[3]>=0 and sp[4]>=0: + entity_list.append({'entity_name':sp[2],'entity_type':'','entity_idx':[[sp[3],sp[4]]]}) + + choice['entity_list']=entity_list + choices.append(choice) + + if choices==[]: + print(data) + continue + result.append({ 'task_type':task_type, + 'subtask_type':subtask_type, + 
'text':sp[1], + 'choices':choices, + 'id':0}) + + return result + + +def save_data(data,file_path): + with open(file_path, 'w', encoding='utf8') as f: + for line in data: + json_data=json.dumps(line,ensure_ascii=False) + f.write(json_data+'\n') + + + +if __name__=="__main__": + parser = argparse.ArgumentParser(description="train") + parser.add_argument("--data_path", type=str,default="") + parser.add_argument("--save_path", type=str,default="") + + args = parser.parse_args() + + + data_path = args.data_path + save_path = args.save_path + + if not os.path.exists(save_path): + os.makedirs(save_path) + + file_list=['dev','train','trial','test'] + train_data = [] + dev_data = [] + for file in file_list: + file_path = os.path.join(data_path,file+'.json') + data=load_data(file_path=file_path) + if 'train' in file or 'trial' in file: + train_data.extend(data) + else: + output_path = os.path.join(save_path,file+'.json') + save_data(data,output_path) + + output_path = os.path.join(save_path,'train.json') + save_data(train_data,output_path) + \ No newline at end of file diff --git a/fengshen/examples/clue1.1/data_preprocessing/csl_preprocessing.py b/fengshen/examples/clue1.1/data_preprocessing/csl_preprocessing.py new file mode 100644 index 0000000000000000000000000000000000000000..2762c4a82cc32fcd353d93f12a241bc900ef4624 --- /dev/null +++ b/fengshen/examples/clue1.1/data_preprocessing/csl_preprocessing.py @@ -0,0 +1,88 @@ +import json +from tqdm import tqdm +import os +import jieba.analyse +import argparse + + +label2desc={'1':'可以','0':'不能'} + +def load_data(file_path,is_training=False): + with open(file_path, 'r', encoding='utf8') as f: + lines = f.readlines() + result=[] + for line in tqdm(lines): + data = json.loads(line) + texta = data['abst'] + abst = data['abst'] + textb = '' + keyword = '、'.join(data['keyword']) + question = '' + + + keyword=data['keyword'] + rs=jieba.analyse.extract_tags(data['abst'],topK=15) + texta='、'.join(rs)+'。'+texta + comm=[] + for k in keyword: + if k in rs: + comm.append(k) + + for word in comm: + if word in abst: + abst=abst.replace(word,word+'(共现关键字)') + + comm=[word for word in comm] + keyword=[word for word in data['keyword']] + + comm_text='共现词汇'+str(len(comm))+'个,分别是'+'、'.join(comm) + + keyword = '、'.join(keyword) + question='' + + + choice = [f'{v}使用{keyword}概括摘要' for k,v in label2desc.items()] + answer = label2desc[data['label']] if 'label' in data.keys() else '' + answer = f'{answer}使用{keyword}概括摘要' + + label = choice.index(answer) if 'label' in data.keys() else 0 + text_id = data['id'] if 'id' in data.keys() else 0 + result.append({'texta':texta, + 'textb':textb, + 'question':question, + 'choice':choice, + 'answer':answer, + 'label':label, + 'id':text_id}) + for i in range(5): + print(result[i]) + return result + + +def save_data(data,file_path): + with open(file_path, 'w', encoding='utf8') as f: + for line in data: + json_data=json.dumps(line,ensure_ascii=False) + f.write(json_data+'\n') + + + +if __name__=="__main__": + parser = argparse.ArgumentParser(description="train") + parser.add_argument("--data_path", type=str,default="") + parser.add_argument("--save_path", type=str,default="") + + args = parser.parse_args() + + + data_path = args.data_path + save_path = args.save_path + + if not os.path.exists(save_path): + os.makedirs(save_path) + + file_list = ['train','dev','test'] + for file in file_list: + file_path = os.path.join(data_path,file+'.json') + output_path = os.path.join(save_path,file+'.json') + save_data(load_data(file_path),output_path) 
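All of the `*_preprocessing.py` scripts in this directory emit the same line-oriented JSON layout that the UniMC pipeline consumes. As a quick reference, the sketch below shows one such record; the field values are invented placeholders, only the keys and their roles are taken from the scripts above.

```python
# Minimal sketch of a converted record (one JSON object per output line).
# The concrete values below are placeholders for illustration only.
import json

record = {
    "task_type": "语义匹配",      # written by afqmc/ocnli, omitted by some scripts
    "texta": "示例句子一",         # first sentence or passage
    "textb": "示例句子二",         # empty string for single-sentence tasks
    "question": "",               # e.g. tnews uses "下面新闻属于哪一个类别?"
    "choice": ["不相似", "相似"],  # natural-language label descriptions
    "answer": "相似",             # empty for unlabeled test splits
    "label": 1,                   # index of `answer` within `choice` (0 on test data)
    "id": 0,
}

print(json.dumps(record, ensure_ascii=False))
```

Keeping the labels as short natural-language descriptions rather than bare class ids matches the multiple-choice formulation that `UniMCPipelines` expects, where each description is scored as an answer candidate.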
diff --git a/fengshen/examples/clue1.1/data_preprocessing/iflytek_preprocessing.py b/fengshen/examples/clue1.1/data_preprocessing/iflytek_preprocessing.py new file mode 100644 index 0000000000000000000000000000000000000000..6a8f5ec44851697ac1a36f299a0a132dcf486b71 --- /dev/null +++ b/fengshen/examples/clue1.1/data_preprocessing/iflytek_preprocessing.py @@ -0,0 +1,188 @@ +import json +from tqdm import tqdm +import os +import argparse + +label2desc={ + '银行': '银行', + '社区服务': '社区', + '电商': '电商', + '支付': '支付', + '经营养成': '养成', + '卡牌': '卡牌', + '借贷': '借贷', + '驾校': '驾校', + '理财': '理财', + '职考': '职考', + '新闻': '新闻', + '旅游资讯': '旅游', + '公共交通': '交通', + '魔幻': '魔幻', + '医疗服务': '医疗', + '影像剪辑': '影像', + '动作类': '动作', + '工具': '工具', + '体育竞技': '体育', + '小说': '小说', + '运动健身': '运动', + '相机': '相机', + '辅助工具': '辅助', + '快递物流': '快递', + '高等教育': '教育', + '股票': '股票', + '菜谱': '菜谱', + '行车辅助': '行车', + '仙侠': '仙侠', + '亲子儿童': '亲子', + '购物咨询': '购物', + '射击游戏': '射击', + '漫画': '漫画', + '中小学': '小学', + '同城服务': '同城', + '成人教育': '成人', + '求职': '求职', + '电子产品': '电子', + '艺术': '艺术', + '薅羊毛': '赚钱', + '约会社交': '约会', + '经营': '经营', + '兼职': '兼职', + '短视频': '短视', + '音乐': '音乐', + '英语': '英语', + '棋牌中心': '棋牌', + '摄影修图': '摄影', + '养生保健': '养生', + '办公': '办公', + '政务': '政务', + '视频': '视频', + '论坛圈子': '论坛', + '彩票': '彩票', + '直播': '直播', + '其他': '其他', + '休闲益智': '休闲', + '策略': '策略', + '即时通讯': '通讯', + '汽车交易': '买车', + '违章': '违章', + '地图导航': '地图', + '民航': '民航', + '电台': '电台', + '语言(非英语)': '语言', + '搞笑': '搞笑', + '婚恋社交': '婚恋', + '社区超市': '超市', + '日常养车': '养车', + '杂志': '杂志', + '视频教育': '在线', + '家政': '家政', + '影视娱乐': '影视', + '装修家居': '装修', + '体育咨讯': '资讯', + '社交工具': '社交', + '餐饮店': '餐饮', + '美颜': '美颜', + '问诊挂号': '挂号', + '飞行空战': '飞行', + '综合预定': '预定', + '电影票务': '票务', + '笔记': '笔记', + '买房': '买房', + '外卖': '外卖', + '母婴': '母婴', + '打车': '打车', + '情侣社交': '情侣', + '日程管理': '日程', + '租车': '租车', + '微博博客': '博客', + '百科': '百科', + '绘画': '绘画', + '铁路': '铁路', + '生活社交': '生活', + '租房': '租房', + '酒店': '酒店', + '保险': '保险', + '问答交流': '问答', + '收款': '收款', + 'MOBA': '竞技', + 'K歌': '唱歌', + '技术': '技术', + '减肥瘦身': '减肥', + '工作社交': '工作', + '团购': '团购', + '记账': '记账', + '女性': '女性', + '公务员': '公务', + '二手': '二手', + '美妆美业': '美妆', + '汽车咨询': '汽车', + '行程管理': '行程', + '免费WIFI': '免费', + '教辅': '教辅', + '成人': '两性', + '出国': '出国', + '婚庆': '婚庆', + '民宿短租': '民宿'} + +choice = [k for k,v in label2desc.items()] +print('1'.join(choice)) +print(len('1'.join(choice))) + + +def load_data(file_path,is_training=False): + with open(file_path, 'r', encoding='utf8') as f: + lines = f.readlines() + result=[] + for line in tqdm(lines): + data = json.loads(line) + texta = data['sentence'] + textb = '' + question = '请问app应用属于?' 
+ + choice = [v for k,v in label2desc.items()] + answer = label2desc[data['label_des']] if 'label_des' in data.keys() else '' + + # choice = [k for k,v in label2desc.items()] + # answer = data['label_des'] if 'label_des' in data.keys() else '' + + label = choice.index(answer) if 'label_des' in data.keys() else 0 + text_id = data['id'] if 'id' in data.keys() else 0 + result.append({'texta':texta, + 'textb':textb, + 'question':question, + 'choice':choice, + 'answer':answer, + 'label':label, + 'id':text_id}) + # for i in range(5): + # print(result[i]) + return result + + +def save_data(data,file_path): + with open(file_path, 'w', encoding='utf8') as f: + for line in data: + json_data=json.dumps(line,ensure_ascii=False) + f.write(json_data+'\n') + + + +if __name__=="__main__": + parser = argparse.ArgumentParser(description="train") + parser.add_argument("--data_path", type=str,default="") + parser.add_argument("--save_path", type=str,default="") + + args = parser.parse_args() + + + data_path = args.data_path + save_path = args.save_path + + if not os.path.exists(save_path): + os.makedirs(save_path) + + file_list = ['train','dev','test'] + for file in file_list: + file_path = os.path.join(data_path,file+'.json') + output_path = os.path.join(save_path,file+'.json') + save_data(load_data(file_path),output_path) \ No newline at end of file diff --git a/fengshen/examples/clue1.1/data_preprocessing/ocnli_preprocessing.py b/fengshen/examples/clue1.1/data_preprocessing/ocnli_preprocessing.py new file mode 100644 index 0000000000000000000000000000000000000000..344a8ea7b7049b9f4373ad4c36dc284c395b0034 --- /dev/null +++ b/fengshen/examples/clue1.1/data_preprocessing/ocnli_preprocessing.py @@ -0,0 +1,60 @@ +import json +from tqdm import tqdm +import os +import argparse + + +label2desc={'contradiction':'矛盾','neutral':'自然','entailment':'蕴含'} + +def load_data(file_path,is_training=False): + with open(file_path, 'r', encoding='utf8') as f: + lines = f.readlines() + result=[] + for line in tqdm(lines): + data = json.loads(line) + texta = data['sentence1'] + textb = data['sentence2'] + question = '' + choice = [v for k,v in label2desc.items()] + answer = label2desc[data['label']] if 'label' in data.keys() else '' + label = choice.index(answer) if 'label' in data.keys() else 0 + text_id = data['id'] if 'id' in data.keys() else 0 + result.append({'task_type':'自然语言推理', + 'texta':texta, + 'textb':textb, + 'question':question, + 'choice':choice, + 'answer':answer, + 'label':label, + 'id':text_id}) + for i in range(5): + print(result[i]) + return result + + +def save_data(data,file_path): + with open(file_path, 'w', encoding='utf8') as f: + for line in data: + json_data=json.dumps(line,ensure_ascii=False) + f.write(json_data+'\n') + + +if __name__=="__main__": + parser = argparse.ArgumentParser(description="train") + parser.add_argument("--data_path", type=str,default="") + parser.add_argument("--save_path", type=str,default="") + + args = parser.parse_args() + + + data_path = args.data_path + save_path = args.save_path + + if not os.path.exists(save_path): + os.makedirs(save_path) + + file_list = ['train','dev','test'] + for file in file_list: + file_path = os.path.join(data_path,file+'.json') + output_path = os.path.join(save_path,file+'.json') + save_data(load_data(file_path),output_path) diff --git a/fengshen/examples/clue1.1/data_preprocessing/tnews_preprocessing.py b/fengshen/examples/clue1.1/data_preprocessing/tnews_preprocessing.py new file mode 100644 index 
0000000000000000000000000000000000000000..9f187fac71b411d77273a1a45544eb9c35151bc9 --- /dev/null +++ b/fengshen/examples/clue1.1/data_preprocessing/tnews_preprocessing.py @@ -0,0 +1,71 @@ +import json +from tqdm import tqdm +import argparse + +label2desc={"news_story": "故事", + "news_culture": "文化", + "news_entertainment": "娱乐", + "news_sports": "体育", + "news_finance": "财经", + "news_house": "房产", + "news_car": "汽车", + "news_edu": "教育", + "news_tech": "科技", + "news_military": "军事", + "news_travel": "旅游", + "news_world": "国际", + "news_stock": "股票", + "news_agriculture": "农业", + "news_game": "电竞"} + +def load_data(file_path,is_training=False): + with open(file_path, 'r', encoding='utf8') as f: + lines = f.readlines() + result=[] + for line in tqdm(lines): + data = json.loads(line) + texta = data['sentence'] + textb = '' + question = '下面新闻属于哪一个类别?' + choice = [v for k,v in label2desc.items()] + answer = label2desc[data['label_desc']] if 'label_desc' in data.keys() else '' + label = choice.index(answer) if 'label_desc' in data.keys() else 0 + text_id = data['id'] if 'id' in data.keys() else 0 + result.append({'texta':texta, + 'textb':textb, + 'question':question, + 'choice':choice, + 'answer':answer, + 'label':label, + 'id':text_id}) + print(result[0]) + return result + + +def save_data(data,file_path): + with open(file_path, 'w', encoding='utf8') as f: + for line in data: + json_data=json.dumps(line,ensure_ascii=False) + f.write(json_data+'\n') + +import os + +if __name__=="__main__": + parser = argparse.ArgumentParser(description="train") + parser.add_argument("--data_path", type=str,default="") + parser.add_argument("--save_path", type=str,default="") + + args = parser.parse_args() + + + data_path = args.data_path + save_path = args.save_path + + if not os.path.exists(save_path): + os.makedirs(save_path) + + file_list = ['train','dev','test1.0','test1.1'] + for file in file_list: + file_path = os.path.join(data_path,file+'.json') + output_path = os.path.join(save_path,file+'.json') + save_data(load_data(file_path),output_path) \ No newline at end of file diff --git a/fengshen/examples/clue1.1/data_preprocessing/wsc_preprocessing.py b/fengshen/examples/clue1.1/data_preprocessing/wsc_preprocessing.py new file mode 100644 index 0000000000000000000000000000000000000000..c9b5ec0a7625ac870e71e77cf8af6256e0a1609c --- /dev/null +++ b/fengshen/examples/clue1.1/data_preprocessing/wsc_preprocessing.py @@ -0,0 +1,81 @@ +import json +from tqdm import tqdm +import os +import argparse + +label2desc={'true':'是','false':'不是'} + + +def load_data(file_path,is_training=False): + with open(file_path, 'r', encoding='utf8') as f: + lines = f.readlines() + result=[] + for line in tqdm(lines): + data = json.loads(line) + target = data['target'] + text=list(data['text']) + if target['span2_index']1 if multi-threaded tasks) +#SBATCH --mem-per-cpu=3G # memory per cpu-core (4G is default) +#SBATCH --gres=gpu:1 # number of gpus per node +#SBATCH --mail-type=ALL # send email when job begins, ends or failed etc. 
+ +#SBATCH --requeue +#SBATCH --qos=preemptive + + +DATA_DIR=./data/cmrc2018 #数据集路径 + +PRETRAINED_MODEL_PATH=IDEA-CCNL/Erlangshen-Ubert-110M-Chinese + +CHECKPOINT_PATH=./checkpoints + +LOAD_CHECKPOINT_PATH=./checkpoints/last.ckpt + +OUTPUT_PATH=./predict/cmrc2018_predict.json + +DEFAULT_ROOT_DIR=./log + + +DATA_ARGS="\ + --data_dir $DATA_DIR \ + --train_data train.json \ + --valid_data dev.json \ + --test_data dev.json \ + --batchsize 32 \ + --max_length 314 \ + " + +MODEL_ARGS="\ + --learning_rate 0.00002 \ + --weight_decay 0.1 \ + --warmup 0.01 \ + --num_labels 1 \ + " + +MODEL_CHECKPOINT_ARGS="\ + --monitor val_span_acc \ + --save_top_k 5 \ + --mode max \ + --every_n_train_steps 100 \ + --save_weights_only true \ + --checkpoint_path $CHECKPOINT_PATH \ + --filename model-{epoch:02d}-{val_span_acc:.4f} \ + " + +#--load_checkpoints_path $LOAD_CHECKPOINT_PATH \ +TRAINER_ARGS="\ + --max_epochs 11 \ + --gpus 1 \ + --check_val_every_n_epoch 1 \ + --gradient_clip_val 0.25 \ + --val_check_interval 0.05 \ + --limit_val_batches 100 \ + --default_root_dir $DEFAULT_ROOT_DIR \ + " + +options=" \ + --pretrained_model_path $PRETRAINED_MODEL_PATH \ + --output_path $OUTPUT_PATH \ + --threshold 0.001 \ + --train \ + $DATA_ARGS \ + $MODEL_ARGS \ + $MODEL_CHECKPOINT_ARGS \ + $TRAINER_ARGS \ + " + +SCRIPT_PATH=./solution/clue_ubert.py +python3 $SCRIPT_PATH $options + diff --git a/fengshen/examples/clue1.1/run_clue_unimc.sh b/fengshen/examples/clue1.1/run_clue_unimc.sh new file mode 100644 index 0000000000000000000000000000000000000000..9d1f576211ddab72bb12a7ef63a7c026754b611b --- /dev/null +++ b/fengshen/examples/clue1.1/run_clue_unimc.sh @@ -0,0 +1,76 @@ +#!/bin/bash +#SBATCH --job-name=slurm-test # create a short name for your job +#SBATCH --nodes=1 # node count +#SBATCH --ntasks=1 # total number of tasks across all nodes +#SBATCH --cpus-per-task=30 # cpu-cores per task (>1 if multi-threaded tasks) +#SBATCH --mem-per-cpu=4G # memory per cpu-core (4G is default) +#SBATCH --gres=gpu:1 # number of gpus per node +#SBATCH --mail-type=ALL # send email when job begins, ends or failed etc. 
+ +#SBATCH --requeue +#SBATCH --qos=preemptive + +TASK=tnews #clue 上的任务 ,可选afqmc、tnews、iflytek、wsc、ocnli、csl、chid、c3 +DATA_ROOT_PATH=./data #数据集路径 +DATA_DIR=$DATA_ROOT_PATH/$TASK + +PRETRAINED_MODEL_PATH=IDEA-CCNL/Erlangshen-UniMC-RoBERTa-110M-Chinese #预训练模型的路径 + +CHECKPOINT_PATH=./checkpoint #训练模型保存的路径 + +LOAD_CHECKPOINT_PATH=./checkpoints/last.ckpt #加载预训练好的模型 + +OUTPUT_PATH=./predict/${TASK}_predict.json + +DEFAULT_ROOT_DIR=./log # 模型日志输出路径 + +DATA_ARGS="\ + --data_dir $DATA_DIR \ + --train_data train.json \ + --valid_data dev.json \ + --test_data test1.1.json \ + --batchsize 1 \ + --max_length 512 \ + " + +# 如果使用的是 UniMC-DeBERTa-1.4B模型,学习率要设置1e-6 + +MODEL_ARGS="\ + --learning_rate 0.000002 \ + --weight_decay 0.1 \ + --warmup 0.06 \ + " + +MODEL_CHECKPOINT_ARGS="\ + --monitor val_acc \ + --save_top_k 3 \ + --mode max \ + --every_n_train_steps 100 \ + --save_ckpt_path $CHECKPOINT_PATH \ + --filename model-{epoch:02d}-{val_acc:.4f} \ + " + +TRAINER_ARGS="\ + --max_epochs 17 \ + --gpus 1 \ + --check_val_every_n_epoch 1 \ + --val_check_interval 100 \ + --gradient_clip_val 0.25 \ + --default_root_dir $DEFAULT_ROOT_DIR \ + " + +#--load_checkpoints_path $LOAD_CHECKPOINT_PATH \ 如果想加载预训练好的ckpt模型,可以使用这个参数加载 + +options=" \ + --pretrained_model_path $PRETRAINED_MODEL_PATH \ + --output_path $OUTPUT_PATH \ + --train \ + $DATA_ARGS \ + $MODEL_ARGS \ + $MODEL_CHECKPOINT_ARGS \ + $TRAINER_ARGS \ + " + +SCRIPT_PATH=./solution/clue_unimc.py +python3 $SCRIPT_PATH $options + diff --git a/fengshen/examples/clue1.1/solution/clue_ubert.py b/fengshen/examples/clue1.1/solution/clue_ubert.py new file mode 100644 index 0000000000000000000000000000000000000000..97b3ed7b5a4eb9dff9dda0d9131ae206d55d1c2f --- /dev/null +++ b/fengshen/examples/clue1.1/solution/clue_ubert.py @@ -0,0 +1,46 @@ +import argparse +from fengshen import UbertPipelines +import os +import json +from tqdm import tqdm + +def load_data(data_path): + with open(data_path, 'r', encoding='utf8') as f: + lines = f.readlines() + samples = [json.loads(line) for line in tqdm(lines)] + return samples + + +def main(): + total_parser = argparse.ArgumentParser("TASK NAME") + total_parser.add_argument('--data_dir', default='./data', type=str) + total_parser.add_argument('--train_data', default='train.json', type=str) + total_parser.add_argument('--valid_data', default='dev.json', type=str) + total_parser.add_argument('--test_data', default='test.json', type=str) + total_parser.add_argument('--output_path',default='./predict.json', type=str) + + total_parser = UbertPipelines.pipelines_args(total_parser) + args = total_parser.parse_args() + + train_data = load_data(os.path.join(args.data_dir, args.train_data)) + dev_data = load_data(os.path.join(args.data_dir, args.valid_data)) + test_data = load_data(os.path.join(args.data_dir, args.test_data)) + + # test_data = test_data[:10] + + model = UbertPipelines(args) + if args.train: + model.fit(train_data, dev_data) + + result = model.predict(test_data) + for line in result[:20]: + print(line) + + with open(args.output_path, 'w', encoding='utf8') as f: + for line in result: + json_data = json.dumps(line, ensure_ascii=False) + f.write(json_data+'\n') + + +if __name__ == "__main__": + main() diff --git a/fengshen/examples/clue1.1/solution/clue_unimc.py b/fengshen/examples/clue1.1/solution/clue_unimc.py new file mode 100644 index 0000000000000000000000000000000000000000..a5ffe4899e31216326260a65d9d12ad7892fc60f --- /dev/null +++ b/fengshen/examples/clue1.1/solution/clue_unimc.py @@ -0,0 +1,63 @@ +import argparse +from 
fengshen.pipelines.multiplechoice import UniMCPipelines +import os +import json +import copy +from tqdm import tqdm + +def load_data(data_path): + with open(data_path, 'r', encoding='utf8') as f: + lines = f.readlines() + samples = [json.loads(line) for line in tqdm(lines)] + return samples + + +def comp_acc(pred_data,test_data): + corr=0 + for i in range(len(pred_data)): + if pred_data[i]['label']==test_data[i]['label']: + corr+=1 + return corr/len(pred_data) + + +def main(): + total_parser = argparse.ArgumentParser("TASK NAME") + total_parser.add_argument('--data_dir', default='./data', type=str) + total_parser.add_argument('--train_data', default='train.json', type=str) + total_parser.add_argument('--valid_data', default='dev.json', type=str) + total_parser.add_argument('--test_data', default='test.json', type=str) + total_parser.add_argument('--output_path', default='', type=str) + + total_parser = UniMCPipelines.piplines_args(total_parser) + args = total_parser.parse_args() + + train_data = load_data(os.path.join(args.data_dir, args.train_data)) + dev_data = load_data(os.path.join(args.data_dir, args.valid_data)) + test_data = load_data(os.path.join(args.data_dir, args.test_data)) + + # dev_data = dev_data[:200] + dev_data_ori=copy.deepcopy(dev_data) + + model = UniMCPipelines(args, args.pretrained_model_path) + + print(args.data_dir) + + if args.train: + model.train(train_data, dev_data) + result = model.predict(dev_data) + for line in result[:20]: + print(line) + + acc=comp_acc(result,dev_data_ori) + print('acc:',acc) + + if args.output_path != '': + test_result = model.predict(test_data) + with open(args.output_path, 'w', encoding='utf8') as f: + for line in test_result: + json_data=json.dumps(line,ensure_ascii=False) + f.write(json_data+'\n') + + +if __name__ == "__main__": + main() diff --git a/fengshen/examples/clue_sim/README.md b/fengshen/examples/clue_sim/README.md new file mode 100644 index 0000000000000000000000000000000000000000..41b5b72129491139fa6f21e7cc2ea07d027a60c3 --- /dev/null +++ b/fengshen/examples/clue_sim/README.md @@ -0,0 +1,90 @@ +# 二郎神打CLUE语义匹配榜 + - [比赛介绍](#比赛介绍) + - [clue语义匹配榜打榜思路](#clue语义匹配榜-打榜思路) + - [数据集介绍](#数据集介绍) + - [环境](#环境) + - [用法](#用法) + - [提交](#提交) + +## 比赛介绍 +- clue的语义匹配榜 (https://www.cluebenchmarks.com/sim.html) +- clue sim官方实例 (https://github.com/CLUEbenchmark/QBQTC) + +## clue语义匹配榜 打榜思路 + +- 直接使用fengshenbang的二郎神模型,就打到了前三。 +- 为了解决标签平衡问题,设计了一个交叉熵平滑滤波loss,就达到了第一。 + +详细的思路讲解在知乎: 链接 + +## 数据集介绍 + +QQ浏览器搜索相关性数据集(QBQTC,QQ Browser Query Title Corpus),是QQ浏览器搜索引擎目前针对大搜场景构建的一个融合了相关性、权威性、内容质量、 +时效性等维度标注的学习排序(LTR)数据集,广泛应用在搜索引擎业务场景中。 + +相关性的含义:0,相关程度差;1,有一定相关性;2,非常相关。数字越大相关性越高。 + +**数据量统计** + +| 训练集(train) | 验证集(dev) | 公开测试集(test_public) | 私有测试集(test) | +| :----: | :----: | :----: | :----: | +| 180,000| 20,000| 5,000 | >=10,0000| + +**评测指标** + +f1_score来自于sklearn.metrics,计算公式如下: +`F1 = 2 * (precision * recall) / (precision + recall)` + +## 环境 +* Python >= 3.6 +* torch == 1.8.0+cu111 +* transforms == 4.6.0 +* pytorch-lightning == 1.3.2 +* 一张GPU: A100 40G + +## 用法 + +fengshenbang的二郎神模型的使用是非常简单的。 + +该example下的代码和思想继承自fengshen/examples/classification/finetune_classification.py + +如果需要直接使用该python脚本,把官方的数据集处理成如下形式: + +```json +{"sentence1": "应届生实习", "sentence2": "实习生招聘-应届生求职网", "label": "1", "id": 0} +``` + +然后修改其中的fengshen/examples/classification/finetune_classification.sh的参数即可。 + +下面介绍该example的用法: + +### 创建文件夹 + +- dataset 文件夹,下载官方数据集后放进来就行 +- weights 文件夹,用以存放二郎神模型 +- submissions 文件夹,用以存放需要评测的json文件 + +### Train +```bash +python main.py \ + --mode 'Train' \ + 
--model_path './weights/Erlangshen-MegatronBert-1.3B-Similarity' \ + --model_name 'IDEA-CCNL/Erlangshen-MegatronBert-1.3B-Similarity' +``` + +加载最优的模型用以Test set的预测。 + +### Test +```bash +python main.py \ + --mode 'Test' \ + --predict_model_path 'your_model_path' \ + --model_path './weights/Erlangshen-MegatronBert-1.3B-Similarity' \ + --model_name 'IDEA-CCNL/Erlangshen-MegatronBert-1.3B-Similarity' +``` + +## 提交 + +在路径 ./submissions 下,找到 qbqtc_predict.json 并且提交到测评系统 + +注意:名字必须为qbqtc_predict.json diff --git a/fengshen/examples/clue_sim/__init__.py b/fengshen/examples/clue_sim/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/fengshen/examples/clue_sim/finetune_clue_sim.py b/fengshen/examples/clue_sim/finetune_clue_sim.py new file mode 100644 index 0000000000000000000000000000000000000000..b05f6ea6ce67c35cd39dedd924df0b663fd5a8b2 --- /dev/null +++ b/fengshen/examples/clue_sim/finetune_clue_sim.py @@ -0,0 +1,325 @@ +# coding=utf-8 +# Copyright 2021 The IDEA Authors. All rights reserved. + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at + +# http://www.apache.org/licenses/LICENSE-2.0 + +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import json +import os +from sklearn import metrics +import torch +import torch.nn as nn +from torch.utils.data import Dataset, DataLoader, ConcatDataset +import pytorch_lightning as pl +from collections import defaultdict +from transformers import AutoConfig, AutoModel, get_cosine_schedule_with_warmup +from loss import FocalLoss, LabelSmoothingCorrectionCrossEntropy + + +class CustomDataset(Dataset): + def __init__(self, file, tokenizer, max_len, mode='no_test'): + self.tokenizer = tokenizer + self.max_len = max_len + self.mode = mode + + self.ex_list = [] + with open('./dataset/' + file, "r", encoding='utf-8') as f: + for line in f: + sample = json.loads(line) + query = sample["query"] + title = sample["title"] + id = int(sample["id"]) + if self.mode == 'no_test': + relevant = int(sample["label"]) + self.ex_list.append((query, title, relevant, id)) + else: + self.ex_list.append((query, title, id)) + + def __len__(self): + return len(self.ex_list) + + def __getitem__(self, index): + if self.mode == 'no_test': + query, title, relevant, id = self.ex_list[index] + else: + query, title, id = self.ex_list[index] + + inputs = self.tokenizer.encode_plus( + query, title, + truncation=True, + add_special_tokens=True, + max_length=self.max_len, + padding='max_length', + return_token_type_ids=True + ) + ids = inputs['input_ids'] + mask = inputs['attention_mask'] + token_type_ids = inputs["token_type_ids"] + if self.mode == 'no_test': + return { + 'ids': torch.tensor(ids, dtype=torch.long), + 'mask': torch.tensor(mask, dtype=torch.long), + 'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long), + 'targets': torch.tensor(relevant, dtype=torch.float), + 'id': torch.tensor(id, dtype=torch.long) + } + else: + return { + 'ids': torch.tensor(ids, dtype=torch.long), + 'mask': torch.tensor(mask, dtype=torch.long), + 'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long), + 'id': 
torch.tensor(id, dtype=torch.long) + } + + +class CustomDataModule(pl.LightningDataModule): + def __init__(self, args, tokenizer): + super().__init__() + self.args = args + self.tokenizer = tokenizer + self.max_len = self.args.max_seq_length + self.train_dataset = None + self.val_dataset = None + + def setup(self, stage): + data_path = "./dataset" + assert os.path.exists(os.path.join(data_path, 'train.json')) + assert os.path.exists(os.path.join(data_path, 'dev.json')) + assert os.path.exists(os.path.join(data_path, 'test_public.json')) + if stage == 'fit': + self.train_dataset = CustomDataset('train.json', self.tokenizer, self.max_len) + self.val_dataset = CustomDataset('dev.json', self.tokenizer, self.max_len) + self.test_dataset = CustomDataset('test_public.json', self.tokenizer, self.max_len) + elif stage == 'test': + self.test_dataset = CustomDataset('test_public.json', self.tokenizer, self.max_len) + + def train_dataloader(self): + full_dataset = ConcatDataset([self.train_dataset, self.val_dataset]) + train_dataloader = DataLoader( + full_dataset, + batch_size=self.args.batch_size, + num_workers=4, + shuffle=True, + pin_memory=True, + drop_last=True) + return train_dataloader + + def val_dataloader(self): + val_dataloader = DataLoader( + self.test_dataset, + batch_size=self.args.val_batch_size, + num_workers=4, + shuffle=False, + pin_memory=True, + drop_last=False) + return val_dataloader + + def test_dataloader(self): + test_dataloader = DataLoader( + self.test_dataset, + batch_size=self.args.val_batch_size, + num_workers=4, + shuffle=False, + pin_memory=True, + drop_last=False) + return test_dataloader + + +class CustomModel(pl.LightningModule): + def __init__(self, args): + super().__init__() + self.args = args + self.model = self.args.model_name + self.cache_dir = self.args.model_path + self.scheduler = self.args.scheduler + self.step_scheduler_after = "batch" + self.optimizer = self.args.optimizer + self.pooler = self.args.use_original_pooler + self.category = self.args.cate_performance + self.loss_func = self.args.loss_function + + hidden_dropout_prob: float = 0.1 + layer_norm_eps: float = 1e-7 + + config = AutoConfig.from_pretrained(self.model, cache_dir=self.cache_dir) + + config.update( + { + "output_hidden_states": False, + "hidden_dropout_prob": hidden_dropout_prob, + "layer_norm_eps": layer_norm_eps, + } + ) + self.transformer = AutoModel.from_pretrained(self.model, config=config, cache_dir=self.cache_dir) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + self.linear = torch.nn.Linear(config.hidden_size, self.args.num_labels, bias=True) # 分三类 + + def configure_optimizers(self): + """Prepare optimizer and schedule""" + model = self.transformer + no_decay = ["bias", "LayerNorm.weight"] + optimizer_grouped_parameters = [ + { + "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], + "weight_decay": 0.01, + }, + { + "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], + "weight_decay": 0.0, + }, + ] + + optimizer_index = ['Adam', 'AdamW'].index(self.optimizer) + optimizer = [ + torch.optim.Adam(optimizer_grouped_parameters, lr=self.args.learning_rate), + torch.optim.AdamW(optimizer_grouped_parameters, lr=self.args.learning_rate)][optimizer_index] + + scheduler_index = ['StepLR', 'CosineWarmup', 'CosineAnnealingLR'].index(self.scheduler) + scheduler = [ + torch.optim.lr_scheduler.StepLR(optimizer, step_size=self.args.warmup_step, + gamma=self.args.warmup_proportion), + 
get_cosine_schedule_with_warmup( + optimizer, + num_warmup_steps=int(self.args.warmup_proportion * self.total_steps), + num_training_steps=self.total_steps, + ), + torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=5, eta_min=2e-06)][scheduler_index] + + scheduler = {"scheduler": scheduler, "interval": "step", "frequency": 1} + return [optimizer], [scheduler] + + def setup(self, stage=None): + if stage != "fit": + return + # calculate total steps + train_dataloader = self.trainer.datamodule.train_dataloader() + gpus = 0 if self.trainer.gpus is None else self.trainer.gpus + tb_size = self.args.batch_size * max(1, gpus) + ab_size = self.trainer.accumulate_grad_batches * float(self.trainer.max_epochs) + self.total_steps = (len(train_dataloader.dataset) // tb_size) // ab_size + + def loss(self, outputs, targets): + lossf_index = ['CE', 'Focal', 'LSCE_correction'].index(self.loss_func) + loss_fct = [nn.CrossEntropyLoss(), FocalLoss(), LabelSmoothingCorrectionCrossEntropy()][lossf_index] + loss = loss_fct(outputs, targets) + return loss + + def category_performance_measure(self, labels_right, labels_pred, num_label=3): + text_labels = [i for i in range(num_label)] + + TP = dict.fromkeys(text_labels, 0) # 预测正确的各个类的数目 + TP_FP = dict.fromkeys(text_labels, 0) # 测试数据集中各个类的数目 + TP_FN = dict.fromkeys(text_labels, 0) # 预测结果中各个类的数目 + + label_dict = defaultdict(list) + for num in range(num_label): + label_dict[num].append(str(num)) + + # 计算TP等数量 + for i in range(0, len(labels_right)): + TP_FP[labels_right[i]] += 1 + TP_FN[labels_pred[i]] += 1 + if labels_right[i] == labels_pred[i]: + TP[labels_right[i]] += 1 + + # 计算准确率P,召回率R,F1值 + results = [] + for key in TP_FP: + P = float(TP[key]) / float(TP_FP[key] + 1e-9) + R = float(TP[key]) / float(TP_FN[key] + 1e-9) + F1 = P * R * 2 / (P + R) if (P + R) != 0 else 0 + # results.append("%s:\t P:%f\t R:%f\t F1:%f" % (key, P, R, F1)) + results.append(F1) + return results + + def monitor_metrics(self, outputs, targets): + pred = torch.argmax(outputs, dim=1).cpu().numpy().tolist() + targets = targets.int().cpu().numpy().tolist() + if self.category: + category_results = self.category_performance_measure( + labels_right=targets, + labels_pred=pred, + num_label=self.args.num_labels + ) + return {"f1": category_results} + else: + f1_score = metrics.f1_score(targets, pred, average="macro") + return {"f1": f1_score} + + def forward(self, ids, mask, token_type_ids, labels): + transformer_out = self.transformer(input_ids=ids, attention_mask=mask, token_type_ids=token_type_ids) + + if self.pooler: + pooler_output = transformer_out.pooler_output + else: + sequence_output = transformer_out.last_hidden_state + pooler_output = torch.mean(sequence_output, dim=1) + logits = self.linear(self.dropout(pooler_output)) + + labels_hat = torch.argmax(logits, dim=1) + correct_count = torch.sum(labels == labels_hat) + return logits, correct_count + + def predict(self, ids, mask, token_type_ids): + transformer_out = self.transformer(input_ids=ids, attention_mask=mask, token_type_ids=token_type_ids) + pooler_output = transformer_out.pooler_output + logits = self.linear(self.dropout(pooler_output)) + logits = torch.argmax(logits, dim=1) + return logits + + def training_step(self, batch, batch_idx): + ids, mask, token_type_ids, labels = batch['ids'], batch['mask'], batch['token_type_ids'], batch['targets'] + logits, correct_count = self.forward(ids, mask, token_type_ids, labels) + loss = self.loss(logits, labels.long()) + f1 = self.monitor_metrics(logits, labels)["f1"] + 
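+ # `f1` is a list of per-class F1 scores when cate_performance is enabled, otherwise a single macro-averaged F1.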
self.log("train_loss", loss, logger=True, prog_bar=True) + self.log('train_acc', correct_count.float() / len(labels), logger=True, prog_bar=True) + if self.category: + self.log("train_f1_key0", f1[0], logger=True, prog_bar=True) + self.log("train_f1_key1", f1[1], logger=True, prog_bar=True) + self.log("train_f1_key2", f1[2], logger=True, prog_bar=True) + else: + self.log("train_f1", f1, logger=True, prog_bar=True) + return loss + + def validation_step(self, batch, batch_idx): + ids, mask, token_type_ids, labels = batch['ids'], batch['mask'], batch['token_type_ids'], batch['targets'] + logits, correct_count = self.forward(ids, mask, token_type_ids, labels) + loss = self.loss(logits, labels.long()) + f1 = self.monitor_metrics(logits, labels)["f1"] + self.log("val_loss", loss, logger=True, prog_bar=True) + self.log("val_acc", correct_count.float() / len(labels), logger=True, prog_bar=True) + if self.category: + self.log("val_f1_key0", f1[0], logger=True, prog_bar=True) + self.log("val_f1_key1", f1[1], logger=True, prog_bar=True) + self.log("val_f1_key2", f1[2], logger=True, prog_bar=True) + else: + self.log("val_f1", f1, logger=True, prog_bar=True) + + def test_step(self, batch, batch_idx): + ids, mask, token_type_ids, labels = batch['ids'], batch['mask'], batch['token_type_ids'], batch['targets'] + logits, correct_count = self.forward(ids, mask, token_type_ids, labels) + loss = self.loss(logits, labels.long()) + f1 = self.monitor_metrics(logits, labels)["f1"] + self.log("test_loss", loss, logger=True, prog_bar=True) + self.log("test_acc", correct_count.float() / len(labels), logger=True, prog_bar=True) + if self.category: + self.log("test_f1_key0", f1[0], logger=True, prog_bar=True) + self.log("test_f1_key1", f1[1], logger=True, prog_bar=True) + self.log("test_f1_key2", f1[2], logger=True, prog_bar=True) + else: + self.log("test_f1", f1, logger=True, prog_bar=True) + return {"test_loss": loss, "logits": logits, "labels": labels} + + def predict_step(self, batch, batch_idx, dataloader_idx): + ids, mask, token_type_ids, id = batch['ids'], batch['mask'], batch['token_type_ids'], batch['id'] + logits = self.predict(ids, mask, token_type_ids) + return {'id': id.cpu().numpy().tolist(), 'logits': logits.cpu().numpy().tolist()} diff --git a/fengshen/examples/clue_sim/loss.py b/fengshen/examples/clue_sim/loss.py new file mode 100644 index 0000000000000000000000000000000000000000..537e2347f65aa952b0eb852c23a39901b0fef52e --- /dev/null +++ b/fengshen/examples/clue_sim/loss.py @@ -0,0 +1,77 @@ +# coding=utf-8 +# Copyright 2021 The IDEA Authors. All rights reserved. + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at + +# http://www.apache.org/licenses/LICENSE-2.0 + +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import torch +from torch.nn import functional as F + + +class FocalLoss(torch.nn.Module): + """Multi-class Focal loss implementation""" + + def __init__(self, gamma=2, weight=None, ignore_index=-100): + super(FocalLoss, self).__init__() + self.gamma = gamma + self.weight = weight + self.ignore_index = ignore_index + + def forward(self, input, target): + """ + input: [N, C] + target: [N, ] + """ + logpt = F.log_softmax(input, dim=1) + pt = torch.exp(logpt) + logpt = (1-pt)**self.gamma * logpt + loss = F.nll_loss(logpt, target, self.weight, ignore_index=self.ignore_index) + return loss + +# 交叉熵平滑滤波 防止过拟合 + + +class LabelSmoothingCorrectionCrossEntropy(torch.nn.Module): + def __init__(self, eps=0.1, reduction='mean', ignore_index=-100): + super(LabelSmoothingCorrectionCrossEntropy, self).__init__() + self.eps = eps + self.reduction = reduction + self.ignore_index = ignore_index + + def forward(self, output, target): + c = output.size()[-1] + log_preds = F.log_softmax(output, dim=-1) + if self.reduction == 'sum': + loss = -log_preds.sum() + else: + loss = -log_preds.sum(dim=-1) + if self.reduction == 'mean': + loss = loss.mean() + + # task specific + labels_hat = torch.argmax(output, dim=1) + lt_sum = labels_hat + target + abs_lt_sub = abs(labels_hat - target) + correction_loss = 0 + for i in range(c): + if lt_sum[i] == 0: + pass + elif lt_sum[i] == 1: + if abs_lt_sub[i] == 1: + pass + else: + correction_loss -= self.eps*(0.5945275813408382) + else: + correction_loss += self.eps*(1/0.32447699714575207) + correction_loss /= c + # print(correction_loss) + return loss*self.eps/c + (1-self.eps) * \ + F.nll_loss(log_preds, target, reduction=self.reduction, ignore_index=self.ignore_index) + correction_loss diff --git a/fengshen/examples/clue_sim/main.py b/fengshen/examples/clue_sim/main.py new file mode 100644 index 0000000000000000000000000000000000000000..91c5a732d8cb1a683aa34a3b3f7c158861cd4492 --- /dev/null +++ b/fengshen/examples/clue_sim/main.py @@ -0,0 +1,133 @@ +# coding=utf-8 +# Copyright 2021 The IDEA Authors. All rights reserved. + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at + +# http://www.apache.org/licenses/LICENSE-2.0 + +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
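+# Entry point for the CLUE QBQTC sentence-similarity example: parses hyper-parameters,
+# builds the data module and Lightning model defined in train_func.py, and either
+# trains the model or writes test-set predictions to a jsonlines submission file.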
+import jsonlines +import torch +import pytorch_lightning as pl +from transformers import AutoTokenizer, BertTokenizer +from train_func import CustomDataset, CustomDataModule, CustomModel +import argparse +import os +import gpustat + +if __name__ == '__main__': + my_parser = argparse.ArgumentParser() + my_parser.add_argument( + "--model_path", default="./weights/Erlangshen-MegatronBert-1.3B-Similarity", type=str, required=False) + my_parser.add_argument( + "--model_name", default="IDEA-CCNL/Erlangshen-MegatronBert-1.3B-Similarity", type=str, required=False) + my_parser.add_argument("--max_seq_length", default=64, type=int, required=False) + my_parser.add_argument("--batch_size", default=32, type=int, required=False) + my_parser.add_argument("--val_batch_size", default=64, type=int, required=False) + my_parser.add_argument("--num_epochs", default=10, type=int, required=False) + my_parser.add_argument("--learning_rate", default=4e-5, type=float, required=False) + my_parser.add_argument("--warmup_proportion", default=0.2, type=int, required=False) + my_parser.add_argument("--warmup_step", default=2, type=int, required=False) + my_parser.add_argument("--num_labels", default=3, type=int, required=False) + my_parser.add_argument("--cate_performance", default=False, type=bool, required=False) + my_parser.add_argument("--use_original_pooler", default=True, type=bool, required=False) + my_parser.add_argument("--model_output_path", default='./pl_model', type=str, required=False) + my_parser.add_argument("--mode", type=str, choices=['Train', 'Test'], required=True) + my_parser.add_argument("--predict_model_path", default='./pl_model/', type=str, required=False) + my_parser.add_argument("--test_output_path", default='./submissions', type=str, required=False) + my_parser.add_argument("--optimizer", default='AdamW', type=str, required=False) # ['Adam', 'AdamW'] + # ['StepLR', 'CosineWarmup', 'CosineAnnealingLR'] + my_parser.add_argument("--scheduler", default='CosineWarmup', type=str, required=False) + my_parser.add_argument("--loss_function", default='LSCE_correction', type=str, + required=False) # ['CE', 'Focal', 'LSCE_correction'] + + args = my_parser.parse_args() + + print(args) + gpustat.print_gpustat() + + if 'Erlangshen' in args.model_name: + tokenizer = BertTokenizer.from_pretrained(args.model_name, cache_dir=args.model_path) + else: + tokenizer = AutoTokenizer.from_pretrained(args.model_name, cache_dir=args.model_path) + + seed = 1919 + pl.seed_everything(seed) + + dm = CustomDataModule( + args=args, + tokenizer=tokenizer, + ) + + metric_index = 2 + checkpoint = pl.callbacks.ModelCheckpoint( + save_top_k=1, + verbose=True, + monitor=['val_loss', 'val_acc', 'val_f1'][metric_index], + mode=['min', 'max', 'max'][metric_index] + ) + + lr_monitor = pl.callbacks.LearningRateMonitor(logging_interval="step") + callbacks = [checkpoint, lr_monitor] + + logger = pl.loggers.TensorBoardLogger(save_dir=os.getcwd(), + name='lightning_logs/' + args.model_name.split('/')[-1]), + + trainer = pl.Trainer( + progress_bar_refresh_rate=50, + logger=logger, + gpus=-1 if torch.cuda.is_available() else None, + amp_backend='native', + amp_level='O2', + precision=16, + callbacks=callbacks, + gradient_clip_val=1.0, + max_epochs=args.num_epochs, + # accelerator='ddp', + # plugins='ddp_sharded', + ) + + if args.mode == 'Train': + print('Only Train') + model = CustomModel( + args=args, + ) + trainer.fit(model, dm) + + # Predict test, save results to json + if args.mode == 'Test': + print('Only Test') + test_loader = 
torch.utils.data.DataLoader( + CustomDataset('test.json', tokenizer, args.max_seq_length, 'test'), + batch_size=args.val_batch_size, + num_workers=4, + shuffle=False, + pin_memory=True, + drop_last=False + ) + + model = CustomModel(args=args).load_from_checkpoint(args.predict_model_path, args=args) + + predict_results = trainer.predict(model, test_loader, return_predictions=True) + + path = os.path.join( + args.test_output_path, + args.model_name.split('/')[-1].replace('-', '_')) + file_path = os.path.join(path, 'qbqtc_predict.json') + + if not os.path.exists(path): + os.makedirs(path) + if os.path.exists(file_path): + print('Json文件已存在, 将用本次结果替换') + + with jsonlines.open(file_path, 'w') as jsonf: + for predict_res in predict_results: + for i, p in zip(predict_res['id'], predict_res['logits']): + jsonf.write({"id": i, "label": str(p)}) + print('Json saved:', file_path) diff --git a/fengshen/examples/deepVAE/pretrain_deep_vae.py b/fengshen/examples/deepVAE/pretrain_deep_vae.py new file mode 100644 index 0000000000000000000000000000000000000000..37884261d487b6d43e1c682f15b7fde6e3beb709 --- /dev/null +++ b/fengshen/examples/deepVAE/pretrain_deep_vae.py @@ -0,0 +1,141 @@ +import torch +import os +import random +import math +import argparse +from fengshen.data.fs_datasets.fs_datamodule import FSDataModule +from fengshen.example.deepVAE.vae_pl_module import DeepVAEModule + +from pytorch_lightning import ( + Trainer, + loggers, +) + +from pytorch_lightning.callbacks import ModelCheckpoint, LearningRateMonitor +from torch.nn.utils.rnn import pad_sequence + + +class NER_RE_Collator: + def __init__(self, bos_token, eos_token, sep_token) -> None: + self.bos_token = bos_token + self.eos_token = eos_token + self.sep_token = sep_token + + def __call__(self, samples, max_len=128): + # when len(samples) is larger than one, we need to save the sentence length info + inputs_tensors, entity_tensors = [], [] + for sp in samples: + # NOTE: in TD-VAE, both encoder and decoder are gpt2, thus use decoder sent twice ! + input_entities, input_ids = sp['decoder_entities'], sp['decoder_target'] + input_entities = input_entities[:max_len] + [self.sep_token] + # shorten input_ids, based on the fact that sentence must be longer than the entities + input_ids = [self.bos_token] + input_ids[:max_len] + [self.eos_token] + entity_tensors.append(torch.tensor(input_entities, dtype=torch.long)) + inputs_tensors.append(torch.tensor(input_ids, dtype=torch.long)) + if not inputs_tensors or not entity_tensors: + return None # if all the examples in the batch exceed max_length sentence + inputs_tensors = pad_sequence(inputs_tensors, batch_first=True, padding_value=0) + entity_tensors = pad_sequence(entity_tensors, batch_first=True, padding_value=0) + return inputs_tensors, entity_tensors + + +class TDVAECollator: + def __init__(self, bos_token, eos_token) -> None: + self.bos_token = bos_token + self.eos_token = eos_token + + def __call__(self, samples, max_len=120): + # when len(samples) is larger than one, we need to save the sentence length info + inputs = [] + for sp in samples: + # NOTE: in TD-VAE, both encoder and decoder are gpt2, thus use decoder sent twice ! 
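+            # pick one sentence that fits within max_len at random, slice it out of the
+            # flattened paragraph token ids, and wrap it with the BOS/EOS special tokens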
+ sent_lengths, input_ids = sp['decoder_sent_lengths'], sp['decoder_target'] + potential_indices = [idx for idx, slen in enumerate(sent_lengths) if slen < max_len] + if len(potential_indices) == 0: + continue # we ignore paragraphs with only one sentence split + selected_idx = random.choice(potential_indices) + start_pos, end_pos = sum(sent_lengths[:selected_idx]), sum(sent_lengths[:selected_idx])+sent_lengths[selected_idx] + selected_input_ids = [self.bos_token] + input_ids[start_pos:end_pos] + [self.eos_token] + inputs.append(torch.tensor(selected_input_ids, dtype=torch.long)) + if not inputs: + return None # if all the examples in the batch exceed max_length sentence + inputs = pad_sequence(inputs, batch_first=True, padding_value=0) + return inputs + + +class ZH_Fin_Collator: + def __init__(self, bos_token, eos_token) -> None: + self.bos_token = bos_token + self.eos_token = eos_token + + def __call__(self, samples, max_len=120): + inputs = [] + for sp in samples: + # NOTE: in TD-VAE, both encoder and decoder are gpt2, thus use decoder sent twice ! + input_ids = sp['input_ids'] + if len(input_ids) == 0: + continue # we ignore paragraphs with empty string + selected_input_ids = [self.bos_token] + input_ids + [self.eos_token] + inputs.append(torch.tensor(selected_input_ids, dtype=torch.long)) + if not inputs: + return None + inputs = pad_sequence(inputs, batch_first=True, padding_value=0) + return inputs + + +class VAEModelCheckpoint: + @ staticmethod + def add_argparse_args(parent_args): + parser = parent_args.add_argument_group('BaseModel') + + parser.add_argument('--monitor', default='total_loss', type=str) + parser.add_argument('--mode', default='min', type=str) + parser.add_argument('--dirpath', default='./log/', type=str) + parser.add_argument('--filename', default='model-{epoch:2d}-{train_loss:.4f}', type=str) + + parser.add_argument('--save_top_k', default=-1, type=int) + parser.add_argument('--every_n_train_steps', default=1000, type=float) + parser.add_argument('--save_weights_only', default=True, type=bool) + + return parent_args + + @staticmethod + def get_callback(args): + return ModelCheckpoint(monitor=args.monitor, + save_top_k=args.save_top_k, + mode=args.mode, + every_n_train_steps=args.every_n_train_steps, + save_weights_only=args.save_weights_only, + dirpath=args.dirpath, + filename=args.filename) + + +if __name__ == '__main__': + args_parser = argparse.ArgumentParser() + + args_parser = FSDataModule.add_data_specific_args(args_parser) + args_parser = Trainer.add_argparse_args(args_parser) + args_parser = DeepVAEModule.add_module_specific_args(args_parser) + args_parser = VAEModelCheckpoint.add_argparse_args(args_parser) + + args = args_parser.parse_args() + # TODO: update this to be tokenizer specific + # collator = NER_RE_Collator(bos_token=21128, eos_token=21129, sep_token=102) + # collator = TDVAECollator(bos_token=21128, eos_token=21129) + collator = ZH_Fin_Collator(bos_token=21128, eos_token=21129) + + data_module = FSDataModule(args=args, collate_fn=collator) + + train_steps = math.ceil(len(data_module.train_dataset)*args.max_epochs / + args.train_batchsize / args.num_nodes / args.gpus) + model = DeepVAEModule(args, train_steps) + + logger = loggers.TensorBoardLogger(save_dir=os.path.join( + args.default_root_dir, 'logs/'), name='deepvae_lightning') + + save_cpt_callback = VAEModelCheckpoint.get_callback(args) + lr_monitor = LearningRateMonitor(logging_interval='step') + trainer = Trainer.from_argparse_args(args, + callbacks=[save_cpt_callback, lr_monitor], + 
logger=logger) + trainer.fit(model, data_module) diff --git a/fengshen/examples/deepVAE/pretrain_deep_vae.sh b/fengshen/examples/deepVAE/pretrain_deep_vae.sh new file mode 100644 index 0000000000000000000000000000000000000000..29967a73689777dd2240bd5916c843f62913b5e3 --- /dev/null +++ b/fengshen/examples/deepVAE/pretrain_deep_vae.sh @@ -0,0 +1,137 @@ +#!/bin/bash + +#SBATCH --job-name=deep_vae_pretrain +#SBATCH --nodes=1 +#SBATCH --ntasks-per-node=1 +#SBATCH --cpus-per-task=32 # +#SBATCH --gres=gpu:1 # number of gpus +#SBATCH -o xxx/outputs/deep_vae/logs/slurm/%x-%j.log +#SBATCH -e xxx/outputs/deep_vae/logs/slurm/%x-%j.err +# SBATCH --requeue +# SBATCH --qos=preemptive + +set -x -e + +ulimit -s unlimited +echo "START TIME: $(date)" + +MASTER_ADDR=$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1) +# export MASTER_ADDR=127.0.0.1 +export MASTER_PORT=$[RANDOM%10000+50000] + +MICRO_BATCH_SIZE=64 +ZERO_STAGE=0 + +ROOT_PATH=xxxx +config_json=${ROOT_PATH}/job_out/ds_config.json + +# Deepspeed figures out GAS dynamically from dynamic GBS via set_train_batch_size() +cat < $config_json +{ + "train_micro_batch_size_per_gpu": ${MICRO_BATCH_SIZE}, + "steps_per_print": 100, + "gradient_clipping": 1.0, + "zero_optimization": { + "stage": $ZERO_STAGE, + "contiguous_gradients": false, + "overlap_comm": true, + "reduce_scatter": true, + "reduce_bucket_size": 50000000, + "allgather_bucket_size": 500000000 + }, + "optimizer": { + "type": "Adam", + "params": { + "lr": 1e-5, + "betas": [ + 0.9, + 0.95 + ], + "eps": 1e-8, + "weight_decay": 1e-2 + } + }, + "scheduler": { + "type": "WarmupLR", + "params":{ + "warmup_min_lr": 5e-6, + "warmup_max_lr": 1e-5 + } + }, + "zero_allow_untested_optimizer": false, + "fp16": { + "enabled": false, + "loss_scale": 0, + "loss_scale_window": 1000, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "activation_checkpointing": { + "partition_activations": false, + "contiguous_memory_optimization": false + }, + "wall_clock_breakdown": false +} +EOT +export PL_DEEPSPEED_CONFIG_PATH=$config_json +export TORCH_EXTENSIONS_DIR=~/tmp + +# NOTE both encoder and decoder use the same model +GPT2_MODEL_PATH=xxx +VAE_ARGS=" + --gpt2_model_path $GPT2_MODEL_PATH \ + --latent_dim 32 \ + --beta_kl_constraints_start 1e-5 \ + --beta_kl_constraints_stop 1. 
\ + --beta_n_cycles 40 \ +" + + +CHECKPOINT_SAVE_PATH=${ROOT_PATH}/checkpoints +MODEL_CHECKPOINT_ARGS="\ + --monitor val_recon_loss \ + --save_top_k 1 \ + --mode min \ + --every_n_train_steps 1000 \ + --save_weights_only True \ + --dirpath $CHECKPOINT_SAVE_PATH \ + --filename checkpoint-{epoch}-{step}-filenum_20_dim_32_beta_1e-5_1_zh_finance \ + " + +TRAINER_ARGS=" + --max_epochs 40 \ + --gpus 1 \ + --num_nodes 1 \ + --precision 16 \ + --val_check_interval 1000 \ + --learning_rate 5e-5 \ + --warmup_steps 10000 \ + --weight_decay 0.01 \ + --default_root_dir ${ROOT_PATH} \ + --log_every_n_steps 50 \ + --strategy deepspeed_stage_2 \ +" +# --strategy deepspeed_stage_2 \ + +# note we use wudao optimus instead of recreating a deepVAE dataset +DATA_ARGS=" + --train_batchsize $MICRO_BATCH_SIZE \ + --eval_batchsize $MICRO_BATCH_SIZE \ + --test_batchsize $MICRO_BATCH_SIZE \ + --num_workers 32 \ + --ds_name zh_finance +" +# --ds_name wudao_tdvae, ner_re_data, zh_finance +# --CVAE +SCRIPTS_PATH=xxx/fengshen/examples/pretrain_vae + +export CMD=" \ + $SCRIPTS_PATH/pretrain_deep_vae.py \ + $TRAINER_ARGS \ + $MODEL_CHECKPOINT_ARGS \ + $VAE_ARGS \ + $DATA_ARGS \ + " +# srun python $CMD +# python -m debugpy --listen 5678 --wait-for-client $CMD +python $CMD \ No newline at end of file diff --git a/fengshen/examples/deepVAE/vae_pl_module.py b/fengshen/examples/deepVAE/vae_pl_module.py new file mode 100644 index 0000000000000000000000000000000000000000..15a7ebdf52983f5266cf446b2c9c83c994f7a4f7 --- /dev/null +++ b/fengshen/examples/deepVAE/vae_pl_module.py @@ -0,0 +1,278 @@ +# coding=utf-8 +# Copyright 2022 IDEA-CCNL The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" PyTorch Della model. 
""" + +import os +import torch +import numpy as np +from fengshen.models.deepVAE.deep_vae import DeepVAE +from pytorch_lightning.core.lightning import LightningModule +from transformers.models.gpt2.configuration_gpt2 import GPT2Config +from transformers.models.bert.tokenization_bert import BertTokenizer +from fengshen.models.deepVAE.latent_connector import GPT2ForDecoderLatentConnector, GPT2ForEncoderLatentConnector +from transformers.optimization import AdamW, get_linear_schedule_with_warmup + + +class DeepVAEModule(LightningModule): + @classmethod + def add_module_specific_args(cls, parser): + group = parser.add_argument_group('vae', 'configurations') + group.add_argument("--checkpoint_path", type=str, default=None) + group.add_argument("--gpt2_model_path", type=str) + group.add_argument("--beta_kl_constraints_start", default=1, type=float, + help="min beta for all the latent z posterior vs prior kl loss") + group.add_argument("--beta_kl_constraints_stop", default=1, type=float, + help="max beta for all the latent z posterior vs prior kl loss") + group.add_argument("--beta_n_cycles", default=30, type=int, + help="number of cycles for kl loss ratio within an epoch") + group.add_argument("--freebit_kl_constraints", default=.1, type=float, + help="free bit for all the latent z kl loss") + group.add_argument("--latent_dim", default=256, type=int, + help="latent dimension of deepVAE Z") + group.add_argument("--learning_rate", default=5e-5, type=float, + help="The initial learning rate for Adam.") + group.add_argument("--weight_decay", default=0.0, type=float, + help="Weight deay if we apply some.") + group.add_argument("--adam_epsilon", default=1e-8, type=float, + help="Epsilon for Adam optimizer.") + group.add_argument("--max_grad_norm", default=1.0, type=float, + help="Max gradient norm.") + group.add_argument("--warmup_steps", default=0, type=int, + help="Linear warmup over warmup_steps.") + group.add_argument("--CVAE", action='store_true', + help="specify this argument if finetuning CVAE, otherwise ignore this argument") + + return parser + + @classmethod + def load_model(cls, args, labels_dict=None): + checkpoint = torch.load(os.path.join(args.checkpoint_path, 'mp_rank_00_model_states.pt')) + + latent_dim = checkpoint['latent_dim'] if ('latent_dim' in checkpoint.keys()) else args.latent_dim + labels_dict = checkpoint['label_dict'] if ('label_dict' in checkpoint.keys()) else labels_dict + + enc_config = GPT2Config.from_pretrained(args.gpt2_model_path) + tokenizer = BertTokenizer.from_pretrained(args.gpt2_model_path) + special_tokens_dict = {'bos_token': '', 'eos_token': ''} + # special_tokens_dict = {'bos_token': '', 'eos_token': '', 'additional_special_tokens': ['', '']} + tokenizer.add_special_tokens(special_tokens_dict) + encoder_model = GPT2ForEncoderLatentConnector(config=enc_config) + encoder_model.resize_token_embeddings(len(tokenizer)) + + dec_config = GPT2Config.from_pretrained(args.gpt2_model_path) + decoder_model = GPT2ForDecoderLatentConnector(config=dec_config, latent_dim=latent_dim) + decoder_model.resize_token_embeddings(len(tokenizer)) + + vae_model = DeepVAE(encoder_model, decoder_model, latent_dim=latent_dim, + hidden_dim=enc_config.hidden_size, layer_num=enc_config.num_hidden_layers, + pad_token_id=tokenizer.pad_token_id, unk_token_id=tokenizer.unk_token_id, + bos_token_id=tokenizer.bos_token_id, eos_token_id=tokenizer.eos_token_id, + CVAE=args.CVAE) + + # TODO: all the related params should be loaded here! 
Including latent_nets, posterior_nets, prior_nets, pooling, decoder.transformer.Wv, decoder.transformer.Wz + anchor = 'module.model.' + start = len(anchor) + vae_dict = {key[start:]: val for key, val in checkpoint['module'].items() if anchor in key} + # comment out if not initialized from VAE + # if args.CVAE: + # # manually load prior and posterior if initialize CVAE model for the first time because of dim mismatch + # prior_post_dict = {key: vae_dict.pop(key) for key in list(vae_dict) if ('posterior_nets' in key or 'prior_nets' in key)} + # for idx in range(enc_config.num_hidden_layers): + # vae_model.posterior_nets[idx].weight.data[:, enc_config.hidden_size:] = prior_post_dict[f"posterior_nets.{idx}.weight"] + # vae_model.prior_nets[idx].weight.data[:, enc_config.hidden_size:] = prior_post_dict[f"prior_nets.{idx}.weight"] + # enc_wte_shape, dec_wte_shape = vae_dict['encoder.transformer.wte.weight'].shape[0], vae_dict['decoder.transformer.wte.weight'].shape[0] + # vae_model.encoder.transformer.wte.weight.data[:enc_wte_shape, :] = vae_dict.pop('encoder.transformer.wte.weight') + # vae_model.decoder.transformer.wte.weight.data[:dec_wte_shape, :] = vae_dict.pop('decoder.transformer.wte.weight') + # vae_model.decoder.lm_head.weight.data[:dec_wte_shape, :] = vae_dict.pop('decoder.lm_head.weight') + missing_keys, unexpected_keys = vae_model.load_state_dict(vae_dict, strict=False) + print(f"Vae model loading process: missing keys {missing_keys}, unexpected keys {unexpected_keys}") + + return vae_model, tokenizer + + def __init__( + self, + args, + train_steps=0, + labels_dict=None + ): + super().__init__() + # self.save_hyperparameters() + self.args = args + + if args.checkpoint_path is not None: + self.model, self.encoder_tokenizer, self.decoder_tokenizer, self.latent_dim, \ + self.labels_dict, self.args = DeepVAEModule.load_model(self.args, labels_dict=labels_dict) + else: + self.encoder_tokenizer = BertTokenizer.from_pretrained(self.args.encoder_model_path) + encoder_config = GPT2Config.from_pretrained(self.args.encoder_model_path) + special_tokens_dict = {'bos_token': '', 'eos_token': '', 'additional_special_tokens': ['', '']} + self.encoder_tokenizer.add_special_tokens(special_tokens_dict) + self.latent_dim = self.args.latent_dim + encoder = GPT2ForEncoderLatentConnector.from_pretrained(self.args.encoder_model_path, config=encoder_config) + # Notice: resize_token_embeddings expect to receive the full size of the new vocabulary, i.e. the length of the tokenizer. + encoder.resize_token_embeddings(len(self.encoder_tokenizer)) + + self.decoder_tokenizer = BertTokenizer.from_pretrained(self.args.decoder_model_path) + self.decoder_tokenizer.add_special_tokens(special_tokens_dict) + decoder_config = GPT2Config.from_pretrained(self.args.decoder_model_path) + self.labels_dict = labels_dict + decoder = GPT2ForDecoderLatentConnector.from_pretrained(self.args.decoder_model_path, config=decoder_config, + latent_dim=self.latent_dim) + + # Notice: resize_token_embeddings expect to receive the full size of the new vocabulary, i.e. the length of the tokenizer. 
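+            # the decoder tokenizer was extended with the same special tokens above,
+            # so the decoder embedding matrix has to be enlarged to match it as well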
+ decoder.resize_token_embeddings(len(self.decoder_tokenizer)) + self.model = DeepVAE(encoder, decoder, latent_dim=self.args.latent_dim, + hidden_dim=encoder_config.hidden_size, layer_num=encoder_config.num_hidden_layers, + pad_token_id=self.decoder_tokenizer.pad_token_id, unk_token_id=self.decoder_tokenizer.unk_token_id, + bos_token_id=self.decoder_tokenizer.bos_token_id, eos_token_id=self.decoder_tokenizer.eos_token_id, + CVAE=args.CVAE) + + self.train_steps = train_steps + # TODO: adjust the cyclic schedule + self.beta_kl_constraints_list = self.get_cyclic_linear_beta_list(self.train_steps, + start=args.beta_kl_constraints_start, stop=args.beta_kl_constraints_stop, n_cycle=args.beta_n_cycles) + # self.mlm_probability_list = self.get_decoder_beta_list(self.train_steps, + # start=0., stop=1., n_cycle=args.beta_n_cycles) + # self.beta_kl_constraints_list = self.get_constant_ratio(self.train_steps, args.beta_kl_constraints) + self.mlm_probability_list = self.get_constant_ratio(self.train_steps, 0.) + # self.freebit_kl_constraints = args.freebit_kl_constraints + + def get_constant_ratio(self, n_steps, ratio): + L = np.ones(n_steps) + L *= ratio + return L + + def get_decoder_beta_list(self, n_steps, start=0., stop=1.0, n_cycle=4): + L = np.ones(n_steps) + t_range = int(n_steps / n_cycle) + for t_cur in range(n_steps): + if t_cur > t_range: + L[t_cur] = 0. + else: + ratio = t_cur / t_range + value = stop - ratio * (stop-start) + L[t_cur] = value + return L + + def get_cyclic_linear_beta_list(self, n_steps, start=0.5, stop=1.0, n_cycle=4): + L = np.ones(n_steps) + t_range = int(n_steps / n_cycle) + for t_cur in range(n_steps): + loc = t_cur % t_range + split_range = int(t_range * 0.25) + if loc <= 2*split_range: + value = start + elif loc <= 3*split_range: + ratio = (loc % split_range) / split_range + value = ratio * (stop-start) + else: + value = stop + L[t_cur] = value + return L + + ##### + # Torch lightning + ##### + + def on_save_checkpoint(self, checkpoint) -> None: + checkpoint['label_dict'] = self.labels_dict + checkpoint['latent_dim'] = self.latent_dim + + def training_step(self, batch, batch_idx): + if batch is None: + loss = torch.Tensor([0.]).to(next(self.model.parameters()).device) + loss.requires_grad = True + return loss + inputs, cond_inputs = batch, None + if self.args.CVAE: + inputs, cond_inputs = batch + + total_loss, rec_loss, total_kl_loss, layer_kl_loss = \ + self.model(inputs, self.beta_kl_constraints_list[batch_idx], cond_inputs) + # the logging interval are set by the trainer_args log_every_n_steps + for idx, pg in enumerate(self.optimizers().param_groups): + self.log(f"learning_rate_{idx}", pg['lr']) + unscaled_kl_constraint_loss = 0. if self.beta_kl_constraints_list[batch_idx] == 0. 
else total_kl_loss/self.beta_kl_constraints_list[batch_idx] + self.log("total_loss", total_loss) + self.log("total_kl_constraint_loss", total_kl_loss) + self.log("unscaled_kl_constraint_loss", unscaled_kl_constraint_loss) + self.log("beta_kl_constraints", self.beta_kl_constraints_list[batch_idx]) + self.log("beta_mlm_probability", self.mlm_probability_list[batch_idx]) + self.log("rec_loss", rec_loss) + for idx, kl_loss in enumerate(layer_kl_loss): + self.log(f"layer_{idx}_kl_loss", kl_loss.mean()) + + return total_loss + + def training_step_end(self, batch_parts): + pass + + def training_epoch_end(self, outputs): + pass + + def validation_step(self, batch, batch_idx): + if batch is None: + loss = torch.Tensor([0.]).to(next(self.model.parameters()).device) + loss.requires_grad = True + return loss + inputs, cond_inputs = batch, None + if self.args.CVAE: + inputs, cond_inputs = batch + + total_loss, rec_loss, total_kl_loss, layer_kl_loss = self.model(inputs, 1., cond_inputs) + # the logging interval are set by the trainer_args log_every_n_steps + self.log("val_total_loss", total_loss) + self.log("val_kl_constraint_loss", total_kl_loss) + self.log("val_recon_loss", rec_loss) + for idx, kl_loss in enumerate(layer_kl_loss): + self.log(f"layer_{idx}_kl_loss", kl_loss.mean()) + return total_loss + + def validation_epoch_end(self, outputs): + pass + + def test_step(self, batch, batch_idx): + if batch is None: + loss = torch.Tensor([0.]).to(next(self.model.parameters()).device) + loss.requires_grad = True + return loss + inputs, cond_inputs = batch, None + if self.args.CVAE: + inputs, cond_inputs = batch + total_loss, rec_loss, total_kl_loss, layer_kl_loss = self.model(inputs, 1., cond_inputs) + self.log("test_total_loss", total_loss) + self.log("test_recon_loss", rec_loss) + self.log("test_kl_constraint_loss", total_kl_loss) + for idx, kl_loss in enumerate(layer_kl_loss): + self.log(f"layer_{idx}_kl_loss", kl_loss.mean()) + return total_loss + + def configure_optimizers(self): + no_decay = ['bias', 'LayerNorm.weight'] + optimizer_grouped_parameters = [ + {'params': [p for n, p in self.model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': self.args.weight_decay}, + {'params': [p for n, p in self.model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0} + ] + + optimizer = AdamW(optimizer_grouped_parameters, lr=self.args.learning_rate, eps=self.args.adam_epsilon) + scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=self.args.warmup_steps, num_training_steps=self.train_steps) + + return {'optimizer': optimizer, + 'lr_scheduler': { + 'scheduler': scheduler, + 'interval': 'step', + 'frequency': 1 + } + } diff --git a/fengshen/examples/disco_project/README.md b/fengshen/examples/disco_project/README.md new file mode 100644 index 0000000000000000000000000000000000000000..c8d95f886e1d80fd1e198eb8d0618c77b6f8836d --- /dev/null +++ b/fengshen/examples/disco_project/README.md @@ -0,0 +1,18 @@ +# Chinese Warp For Disco Diffusion +- This is a chinese version disco diffusion. We train a Chinese CLIP [IDEA-CCNL/Taiyi-CLIP-Roberta-large-326M-Chinese](https://huggingface.co./IDEA-CCNL/Taiyi-CLIP-Roberta-large-326M-Chinese) and utilize it to guide the diffusion process. +- This code is modified from https://github.com/alembics/disco-diffusion +- streamlit demo is supported. +- the checkpoint has been upload to hugging face. 
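+
+Besides the command line entry point below, the `Diffuser` class in `disco.py` can be called directly from Python. A minimal sketch, based on the `__main__` block of `disco.py` (the prompt, checkpoint name and output filename are only examples):
+
+```
+from disco import Diffuser
+
+dd = Diffuser('IDEA-CCNL/Taiyi-Diffusion-532M-Nature')  # or IDEA-CCNL/Taiyi-Diffusion-532M-Cyberpunk
+image = dd.generate(
+    ['夕阳西下'],              # list of Chinese text prompts
+    clip_guidance_scale=7500,  # strength of the Taiyi-CLIP guidance
+    side_x=512, side_y=512,    # output resolution, multiples of 64
+)
+image.save('result.png')       # generate() also writes samples to images_out/
+```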
+## Usage + +- Install the lack package directly +### Run Directly +``` +python disco.py --prompt 夕阳西下 --model_path IDEA-CCNL/Taiyi-Diffusion-532M-Nature # or IDEA-CCNL/Taiyi-Diffusion-532M-Cyberpunk +``` + +### Streamlit Setup +``` +streamlit run st_disco.py +# --server.port=xxxx --server.address=xxxx +``` diff --git a/fengshen/examples/disco_project/disco.py b/fengshen/examples/disco_project/disco.py new file mode 100644 index 0000000000000000000000000000000000000000..8c8b516f13311c9797ea27fa6410361a5dfa715a --- /dev/null +++ b/fengshen/examples/disco_project/disco.py @@ -0,0 +1,735 @@ +import os +import sys +# sys.path.insert(0, f'{PROJECT_DIR}/guided-diffusion') # 加在前面,不再读取库文件的东西。 +import subprocess +import io +import torch.nn as nn +from torch.nn import functional as F +import torch +import torchvision.transforms.functional as TF +import torchvision.transforms as T +import math +import requests +import cv2 +from resize_right import resize +from guided_diffusion.guided_diffusion.script_util import model_and_diffusion_defaults +from types import SimpleNamespace +from PIL import Image +import argparse +from guided_diffusion.guided_diffusion.unet import HFUNetModel +from tqdm.notebook import tqdm +from datetime import datetime +from guided_diffusion.guided_diffusion.script_util import create_model_and_diffusion +import clip +from transformers import BertForSequenceClassification, BertTokenizer +import gc +import random + + +# ======================== GLOBAL SETTING ======================== +PROJECT_DIR = os.path.dirname(os.path.abspath(__file__)) + +useCPU = False # @param {type:"boolean"} +skip_augs = False # @param{type: 'boolean'} +perlin_init = False # @param{type: 'boolean'} + +use_secondary_model = False +diffusion_model = "custom" + +# Dimensions must by multiples of 64. 
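+# default output resolution; the guided-diffusion model below is configured with
+# image_size 512, and Diffuser.generate() also accepts side_x/side_y per call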
+side_x = 512 +side_y = 512 + +diffusion_sampling_mode = 'ddim' # @param ['plms','ddim'] +use_checkpoint = True # @param {type: 'boolean'} +ViTB32 = False # @param{type:"boolean"} +ViTB16 = False # @param{type:"boolean"} +ViTL14 = True # @param{type:"boolean"} +ViTL14_336px = False # @param{type:"boolean"} +RN101 = False # @param{type:"boolean"} +RN50 = False # @param{type:"boolean"} +RN50x4 = False # @param{type:"boolean"} +RN50x16 = False # @param{type:"boolean"} +RN50x64 = False # @param{type:"boolean"} + + +# @markdown #####**OpenCLIP settings:** +ViTB32_laion2b_e16 = False # @param{type:"boolean"} +ViTB32_laion400m_e31 = False # @param{type:"boolean"} +ViTB32_laion400m_32 = False # @param{type:"boolean"} +ViTB32quickgelu_laion400m_e31 = False # @param{type:"boolean"} +ViTB32quickgelu_laion400m_e32 = False # @param{type:"boolean"} +ViTB16_laion400m_e31 = False # @param{type:"boolean"} +ViTB16_laion400m_e32 = False # @param{type:"boolean"} +RN50_yffcc15m = False # @param{type:"boolean"} +RN50_cc12m = False # @param{type:"boolean"} +RN50_quickgelu_yfcc15m = False # @param{type:"boolean"} +RN50_quickgelu_cc12m = False # @param{type:"boolean"} +RN101_yfcc15m = False # @param{type:"boolean"} +RN101_quickgelu_yfcc15m = False # @param{type:"boolean"} + +# @markdown ####**Basic Settings:** + +# NOTE steps可以改这里,需要重新初始化模型,我懒得改接口了orz +steps = 100 # @param [25,50,100,150,250,500,1000]{type: 'raw', allow-input: true} +tv_scale = 0 # @param{type: 'number'} +range_scale = 150 # @param{type: 'number'} +sat_scale = 0 # @param{type: 'number'} +cutn_batches = 1 # @param{type: 'number'} # NOTE 这里会对图片做数据增强,累计计算n次CLIP的梯度,以此作为guidance。 +skip_augs = False # @param{type: 'boolean'} +# @markdown ####**Saving:** + +intermediate_saves = 0 # @param{type: 'raw'} +intermediates_in_subfolder = True # @param{type: 'boolean'} + +# perlin_init = False # @param{type: 'boolean'} +perlin_mode = 'mixed' # @param ['mixed', 'color', 'gray'] +set_seed = 'random_seed' # @param{type: 'string'} +eta = 0.8 # @param{type: 'number'} +clamp_grad = True # @param{type: 'boolean'} +clamp_max = 0.05 # @param{type: 'number'} + +# EXTRA ADVANCED SETTINGS: +randomize_class = True +clip_denoised = False +fuzzy_prompt = False +rand_mag = 0.05 + +# @markdown --- +cut_overview = "[12]*400+[4]*600" # @param {type: 'string'} +cut_innercut = "[4]*400+[12]*600" # @param {type: 'string'} +cut_ic_pow = "[1]*1000" # @param {type: 'string'} +cut_icgray_p = "[0.2]*400+[0]*600" # @param {type: 'string'} + + +# @markdown ####**Transformation Settings:** +use_vertical_symmetry = False # @param {type:"boolean"} +use_horizontal_symmetry = False # @param {type:"boolean"} +transformation_percent = [0.09] # @param + +display_rate = 3 # @param{type: 'number'} +n_batches = 1 # @param{type: 'number'} + +# @markdown If you're having issues with model downloads, check this to compare SHA's: +check_model_SHA = False # @param{type:"boolean"} +interp_spline = 'Linear' # Do not change, currently will not look good. 
param ['Linear','Quadratic','Cubic']{type:"string"} +resume_run = False +batch_size = 1 + + +def createPath(filepath): + os.makedirs(filepath, exist_ok=True) + + +def wget(url, outputdir): + res = subprocess.run(['wget', url, '-P', f'{outputdir}'], stdout=subprocess.PIPE).stdout.decode('utf-8') + print(res) + + +def alpha_sigma_to_t(alpha, sigma): + return torch.atan2(sigma, alpha) * 2 / math.pi + + +def interp(t): + return 3 * t**2 - 2 * t ** 3 + + +def perlin(width, height, scale=10, device=None): + gx, gy = torch.randn(2, width + 1, height + 1, 1, 1, device=device) + xs = torch.linspace(0, 1, scale + 1)[:-1, None].to(device) + ys = torch.linspace(0, 1, scale + 1)[None, :-1].to(device) + wx = 1 - interp(xs) + wy = 1 - interp(ys) + dots = 0 + dots += wx * wy * (gx[:-1, :-1] * xs + gy[:-1, :-1] * ys) + dots += (1 - wx) * wy * (-gx[1:, :-1] * (1 - xs) + gy[1:, :-1] * ys) + dots += wx * (1 - wy) * (gx[:-1, 1:] * xs - gy[:-1, 1:] * (1 - ys)) + dots += (1 - wx) * (1 - wy) * (-gx[1:, 1:] * (1 - xs) - gy[1:, 1:] * (1 - ys)) + return dots.permute(0, 2, 1, 3).contiguous().view(width * scale, height * scale) + + +def perlin_ms(octaves, width, height, grayscale, device=None): + out_array = [0.5] if grayscale else [0.5, 0.5, 0.5] + # out_array = [0.0] if grayscale else [0.0, 0.0, 0.0] + for i in range(1 if grayscale else 3): + scale = 2 ** len(octaves) + oct_width = width + oct_height = height + for oct in octaves: + p = perlin(oct_width, oct_height, scale, device) + out_array[i] += p * oct + scale //= 2 + oct_width *= 2 + oct_height *= 2 + return torch.cat(out_array) + + +def fetch(url_or_path): + if str(url_or_path).startswith('http://') or str(url_or_path).startswith('https://'): + r = requests.get(url_or_path) + r.raise_for_status() + fd = io.BytesIO() + fd.write(r.content) + fd.seek(0) + return fd + return open(url_or_path, 'rb') + + +def read_image_workaround(path): + """OpenCV reads images as BGR, Pillow saves them as RGB. 
Work around + this incompatibility to avoid colour inversions.""" + im_tmp = cv2.imread(path) + return cv2.cvtColor(im_tmp, cv2.COLOR_BGR2RGB) + + +def parse_prompt(prompt): + if prompt.startswith('http://') or prompt.startswith('https://'): + vals = prompt.rsplit(':', 2) + vals = [vals[0] + ':' + vals[1], *vals[2:]] + else: + vals = prompt.rsplit(':', 1) + vals = vals + ['', '1'][len(vals):] + return vals[0], float(vals[1]) + + +def sinc(x): + return torch.where(x != 0, torch.sin(math.pi * x) / (math.pi * x), x.new_ones([])) + + +def lanczos(x, a): + cond = torch.logical_and(-a < x, x < a) + out = torch.where(cond, sinc(x) * sinc(x / a), x.new_zeros([])) + return out / out.sum() + + +def ramp(ratio, width): + n = math.ceil(width / ratio + 1) + out = torch.empty([n]) + cur = 0 + for i in range(out.shape[0]): + out[i] = cur + cur += ratio + return torch.cat([-out[1:].flip([0]), out])[1:-1] + + +def resample(input, size, align_corners=True): + n, c, h, w = input.shape + dh, dw = size + + input = input.reshape([n * c, 1, h, w]) + + if dh < h: + kernel_h = lanczos(ramp(dh / h, 2), 2).to(input.device, input.dtype) + pad_h = (kernel_h.shape[0] - 1) // 2 + input = F.pad(input, (0, 0, pad_h, pad_h), 'reflect') + input = F.conv2d(input, kernel_h[None, None, :, None]) + + if dw < w: + kernel_w = lanczos(ramp(dw / w, 2), 2).to(input.device, input.dtype) + pad_w = (kernel_w.shape[0] - 1) // 2 + input = F.pad(input, (pad_w, pad_w, 0, 0), 'reflect') + input = F.conv2d(input, kernel_w[None, None, None, :]) + + input = input.reshape([n, c, h, w]) + return F.interpolate(input, size, mode='bicubic', align_corners=align_corners) + + +class MakeCutouts(nn.Module): + def __init__(self, cut_size, cutn, skip_augs=False): + super().__init__() + self.cut_size = cut_size + self.cutn = cutn + self.skip_augs = skip_augs + self.augs = T.Compose([ + T.RandomHorizontalFlip(p=0.5), + T.Lambda(lambda x: x + torch.randn_like(x) * 0.01), + T.RandomAffine(degrees=15, translate=(0.1, 0.1)), + T.Lambda(lambda x: x + torch.randn_like(x) * 0.01), + T.RandomPerspective(distortion_scale=0.4, p=0.7), + T.Lambda(lambda x: x + torch.randn_like(x) * 0.01), + T.RandomGrayscale(p=0.15), + T.Lambda(lambda x: x + torch.randn_like(x) * 0.01), + # T.ColorJitter(brightness=0.1, contrast=0.1, saturation=0.1, hue=0.1), + ]) + + def forward(self, input): + input = T.Pad(input.shape[2] // 4, fill=0)(input) + sideY, sideX = input.shape[2:4] + max_size = min(sideX, sideY) + + cutouts = [] + for ch in range(self.cutn): + if ch > self.cutn - self.cutn // 4: + cutout = input.clone() + else: + size = int(max_size * torch.zeros(1,).normal_(mean=.8, std=.3).clip(float(self.cut_size / max_size), 1.)) + offsetx = torch.randint(0, abs(sideX - size + 1), ()) + offsety = torch.randint(0, abs(sideY - size + 1), ()) + cutout = input[:, :, offsety:offsety + size, offsetx:offsetx + size] + + if not self.skip_augs: + cutout = self.augs(cutout) + cutouts.append(resample(cutout, (self.cut_size, self.cut_size))) + del cutout + + cutouts = torch.cat(cutouts, dim=0) + return cutouts + + +class MakeCutoutsDango(nn.Module): + def __init__(self, cut_size, args, + Overview=4, + InnerCrop=0, IC_Size_Pow=0.5, IC_Grey_P=0.2, + ): + super().__init__() + self.padargs = {} + self.cutout_debug = False + self.cut_size = cut_size + self.Overview = Overview + self.InnerCrop = InnerCrop + self.IC_Size_Pow = IC_Size_Pow + self.IC_Grey_P = IC_Grey_P + self.augs = T.Compose([ + T.RandomHorizontalFlip(p=0.5), + T.Lambda(lambda x: x + torch.randn_like(x) * 0.01), + 
T.RandomAffine(degrees=10, translate=(0.05, 0.05), interpolation=T.InterpolationMode.BILINEAR), + T.Lambda(lambda x: x + torch.randn_like(x) * 0.01), + T.RandomGrayscale(p=0.1), + T.Lambda(lambda x: x + torch.randn_like(x) * 0.01), + T.ColorJitter(brightness=0.1, contrast=0.1, saturation=0.1, hue=0.1), + ]) + + def forward(self, input): + cutouts = [] + gray = T.Grayscale(3) + sideY, sideX = input.shape[2:4] + max_size = min(sideX, sideY) + min_size = min(sideX, sideY, self.cut_size) + output_shape = [1, 3, self.cut_size, self.cut_size] + pad_input = F.pad(input, ((sideY - max_size) // 2, (sideY - max_size) // 2, (sideX - max_size) // 2, (sideX - max_size) // 2), **self.padargs) + cutout = resize(pad_input, out_shape=output_shape) + + if self.Overview > 0: + if self.Overview <= 4: + if self.Overview >= 1: + cutouts.append(cutout) + if self.Overview >= 2: + cutouts.append(gray(cutout)) + if self.Overview >= 3: + cutouts.append(TF.hflip(cutout)) + if self.Overview == 4: + cutouts.append(gray(TF.hflip(cutout))) + else: + cutout = resize(pad_input, out_shape=output_shape) + for _ in range(self.Overview): + cutouts.append(cutout) + + if self.cutout_debug: + # if is_colab: + # TF.to_pil_image(cutouts[0].clamp(0, 1).squeeze(0)).save("/content/cutout_overview0.jpg",quality=99) + # else: + TF.to_pil_image(cutouts[0].clamp(0, 1).squeeze(0)).save("cutout_overview0.jpg", quality=99) + + if self.InnerCrop > 0: + for i in range(self.InnerCrop): + size = int(torch.rand([])**self.IC_Size_Pow * (max_size - min_size) + min_size) + offsetx = torch.randint(0, sideX - size + 1, ()) + offsety = torch.randint(0, sideY - size + 1, ()) + cutout = input[:, :, offsety:offsety + size, offsetx:offsetx + size] + if i <= int(self.IC_Grey_P * self.InnerCrop): + cutout = gray(cutout) + cutout = resize(cutout, out_shape=output_shape) + cutouts.append(cutout) + if self.cutout_debug: + # if is_colab: + # TF.to_pil_image(cutouts[-1].clamp(0, 1).squeeze(0)).save("/content/cutout_InnerCrop.jpg",quality=99) + # else: + TF.to_pil_image(cutouts[-1].clamp(0, 1).squeeze(0)).save("cutout_InnerCrop.jpg", quality=99) + cutouts = torch.cat(cutouts) + if skip_augs is not True: + cutouts = self.augs(cutouts) + return cutouts + + +def spherical_dist_loss(x, y): + x = F.normalize(x, dim=-1) + y = F.normalize(y, dim=-1) + return (x - y).norm(dim=-1).div(2).arcsin().pow(2).mul(2) + + +def tv_loss(input): + """L2 total variation loss, as in Mahendran et al.""" + input = F.pad(input, (0, 1, 0, 1), 'replicate') + x_diff = input[..., :-1, 1:] - input[..., :-1, :-1] + y_diff = input[..., 1:, :-1] - input[..., :-1, :-1] + return (x_diff**2 + y_diff**2).mean([1, 2, 3]) + + +def range_loss(input): + return (input - input.clamp(-1, 1)).pow(2).mean([1, 2, 3]) + + +def symmetry_transformation_fn(x): + # NOTE 强制图像对称 + use_horizontal_symmetry = False + if use_horizontal_symmetry: + [n, c, h, w] = x.size() + x = torch.concat((x[:, :, :, :w // 2], torch.flip(x[:, :, :, :w // 2], [-1])), -1) + print("horizontal symmetry applied") + if use_vertical_symmetry: + [n, c, h, w] = x.size() + x = torch.concat((x[:, :, :h // 2, :], torch.flip(x[:, :, :h // 2, :], [-2])), -2) + print("vertical symmetry applied") + return x + + +# def split_prompts(prompts): +# prompt_series = pd.Series([np.nan for a in range(max_frames)]) +# for i, prompt in prompts.items(): +# prompt_series[i] = prompt +# # prompt_series = prompt_series.astype(str) +# prompt_series = prompt_series.ffill().bfill() +# return prompt_series + + +""" +other chaos settings +""" +# dir settings + 
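+# images_out/ collects the samples written by Diffuser.generate();
+# models/ is created as a local folder for model files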
+outDirPath = f'{PROJECT_DIR}/images_out' +createPath(outDirPath) +model_path = f'{PROJECT_DIR}/models' +createPath(model_path) + + +# GPU setup +DEVICE = torch.device('cuda:0' if (torch.cuda.is_available() and not useCPU) else 'cpu') +print('Using device:', DEVICE) +device = DEVICE # At least one of the modules expects this name.. +if not useCPU: + if torch.cuda.get_device_capability(DEVICE) == (8, 0): # A100 fix thanks to Emad + print('Disabling CUDNN for A100 gpu', file=sys.stderr) + torch.backends.cudnn.enabled = False + +model_config = model_and_diffusion_defaults() +model_config.update({ + 'attention_resolutions': '32, 16, 8', + 'class_cond': False, + 'diffusion_steps': 1000, # No need to edit this, it is taken care of later. + 'rescale_timesteps': True, + 'timestep_respacing': 250, # No need to edit this, it is taken care of later. + 'image_size': 512, + 'learn_sigma': True, + 'noise_schedule': 'linear', + 'num_channels': 256, + 'num_head_channels': 64, + 'num_res_blocks': 2, + 'resblock_updown': True, + 'use_checkpoint': use_checkpoint, + 'use_fp16': not useCPU, + 'use_scale_shift_norm': True, +}) + +model_default = model_config['image_size'] +normalize = T.Normalize(mean=[0.48145466, 0.4578275, 0.40821073], std=[0.26862954, 0.26130258, 0.27577711]) + +# Make folder for batch +steps_per_checkpoint = steps + 10 +# Update Model Settings +timestep_respacing = f'ddim{steps}' +diffusion_steps = (1000 // steps) * steps if steps < 1000 else steps +model_config.update({ + 'timestep_respacing': timestep_respacing, + 'diffusion_steps': diffusion_steps, +}) + + +start_frame = 0 +print('Starting Run:') +if set_seed == 'random_seed': + random.seed() + seed = random.randint(0, 2**32) + # print(f'Using seed: {seed}') +else: + seed = int(set_seed) + +args = { + # 'seed': seed, + 'display_rate': display_rate, + 'n_batches': n_batches, + 'batch_size': batch_size, + 'steps': steps, + 'diffusion_sampling_mode': diffusion_sampling_mode, + # 'width_height': width_height, + 'tv_scale': tv_scale, + 'range_scale': range_scale, + 'sat_scale': sat_scale, + 'cutn_batches': cutn_batches, + # 'side_x': side_x, + # 'side_y': side_y, + 'timestep_respacing': timestep_respacing, + 'diffusion_steps': diffusion_steps, + 'cut_overview': eval(cut_overview), + 'cut_innercut': eval(cut_innercut), + 'cut_ic_pow': eval(cut_ic_pow), + 'cut_icgray_p': eval(cut_icgray_p), + 'intermediate_saves': intermediate_saves, + 'intermediates_in_subfolder': intermediates_in_subfolder, + 'steps_per_checkpoint': steps_per_checkpoint, + 'set_seed': set_seed, + 'eta': eta, + 'clamp_grad': clamp_grad, + 'clamp_max': clamp_max, + 'skip_augs': skip_augs, + 'randomize_class': randomize_class, + 'clip_denoised': clip_denoised, + 'fuzzy_prompt': fuzzy_prompt, + 'rand_mag': rand_mag, + 'use_vertical_symmetry': use_vertical_symmetry, + 'use_horizontal_symmetry': use_horizontal_symmetry, + 'transformation_percent': transformation_percent, +} +args = SimpleNamespace(**args) + +# ======================== GLOBAL SETTING END ======================== + + +class Diffuser: + def __init__(self, cutom_path='IDEA-CCNL/Taiyi-Diffusion-532M-Nature'): + self.model_setup(cutom_path) + + def model_setup(self, custom_path): + # LOADING MODEL + os.environ['KMP_DUPLICATE_LIB_OK'] = 'TRUE' + print(f'Prepping model...model name: {custom_path}') + __, self.diffusion = create_model_and_diffusion(**model_config) + self.model = HFUNetModel.from_pretrained(custom_path) + # total = get_parameter_num(self.model) + # print("Number of parameter: %.2fM" % (total/1e6)) + # 
print("Number of parameter: %.2fM" % (total/1024/1024)) + + self.model.requires_grad_(False).eval().to(device) + for name, param in self.model.named_parameters(): + if 'qkv' in name or 'norm' in name or 'proj' in name: + param.requires_grad_() + if model_config['use_fp16']: + self.model.convert_to_fp16() + print(f'Diffusion_model Loaded {diffusion_model}') + + # NOTE Directly Load The Text Encoder From Hugging Face + print('Prepping model...model name: CLIP') + self.taiyi_tokenizer = BertTokenizer.from_pretrained("IDEA-CCNL/Taiyi-CLIP-Roberta-large-326M-Chinese") + self.taiyi_transformer = BertForSequenceClassification.from_pretrained("IDEA-CCNL/Taiyi-CLIP-Roberta-large-326M-Chinese").eval().to(device) + self.clip_models = [] + if ViTB32: + self.clip_models.append(clip.load('ViT-B/32', jit=False)[0].eval().requires_grad_(False).to(device)) + if ViTB16: + self.clip_models.append(clip.load('ViT-B/16', jit=False)[0].eval().requires_grad_(False).to(device)) + if ViTL14: + self.clip_models.append(clip.load('ViT-L/14', jit=False)[0].eval().requires_grad_(False).to(device)) + if ViTL14_336px: + self.clip_models.append(clip.load('ViT-L/14@336px', jit=False)[0].eval().requires_grad_(False).to(device)) + print('CLIP Loaded') + # self.lpips_model = lpips.LPIPS(net='vgg').to(device) + + def generate(self, + input_text_prompts=['夕阳西下'], + init_image=None, + skip_steps=10, + clip_guidance_scale=7500, + init_scale=2000, + st_dynamic_image=None, + seed=None, + side_x=512, + side_y=512, + ): + + seed = seed + frame_num = 0 + init_image = init_image + init_scale = init_scale + skip_steps = skip_steps + loss_values = [] + # if seed is not None: + # np.random.seed(seed) + # random.seed(seed) + # torch.manual_seed(seed) + # torch.cuda.manual_seed_all(seed) + # torch.backends.cudnn.deterministic = True + # target_embeds, weights = [], [] + frame_prompt = input_text_prompts + + print(f'Frame {frame_num} Prompt: {frame_prompt}') + + model_stats = [] + for clip_model in self.clip_models: + # cutn = 16 + model_stat = {"clip_model": None, "target_embeds": [], "make_cutouts": None, "weights": []} + model_stat["clip_model"] = clip_model + + for prompt in frame_prompt: + txt, weight = parse_prompt(prompt) + # txt = clip_model.encode_text(clip.tokenize(prompt).to(device)).float() + # NOTE use chinese CLIP + txt = self.taiyi_transformer(self.taiyi_tokenizer(txt, return_tensors='pt')['input_ids'].to(device)).logits + if args.fuzzy_prompt: + for i in range(25): + model_stat["target_embeds"].append((txt + torch.randn(txt.shape).cuda() * args.rand_mag).clamp(0, 1)) + model_stat["weights"].append(weight) + else: + model_stat["target_embeds"].append(txt) + model_stat["weights"].append(weight) + + model_stat["target_embeds"] = torch.cat(model_stat["target_embeds"]) + model_stat["weights"] = torch.tensor(model_stat["weights"], device=device) + if model_stat["weights"].sum().abs() < 1e-3: + raise RuntimeError('The weights must not sum to 0.') + model_stat["weights"] /= model_stat["weights"].sum().abs() + model_stats.append(model_stat) + + init = None + if init_image is not None: + # init = Image.open(fetch(init_image)).convert('RGB') # 传递的是加载好的图片。而非地址~ + init = init_image + init = init.resize((side_x, side_y), Image.LANCZOS) + init = TF.to_tensor(init).to(device).unsqueeze(0).mul(2).sub(1) + + cur_t = None + + def cond_fn(x, t, y=None): + with torch.enable_grad(): + x_is_NaN = False + x = x.detach().requires_grad_() + n = x.shape[0] + + my_t = torch.ones([n], device=device, dtype=torch.long) * cur_t + out = 
self.diffusion.p_mean_variance(self.model, x, my_t, clip_denoised=False, model_kwargs={'y': y}) + fac = self.diffusion.sqrt_one_minus_alphas_cumprod[cur_t] + x_in = out['pred_xstart'] * fac + x * (1 - fac) + x_in_grad = torch.zeros_like(x_in) + + for model_stat in model_stats: + for i in range(args.cutn_batches): + t_int = int(t.item()) + 1 # errors on last step without +1, need to find source + # try: + input_resolution = model_stat["clip_model"].visual.input_resolution + # except: + # input_resolution = 224 + + cuts = MakeCutoutsDango(input_resolution, + Overview=args.cut_overview[1000 - t_int], + InnerCrop=args.cut_innercut[1000 - t_int], + IC_Size_Pow=args.cut_ic_pow[1000 - t_int], + IC_Grey_P=args.cut_icgray_p[1000 - t_int], + args=args, + ) + clip_in = normalize(cuts(x_in.add(1).div(2))) + image_embeds = model_stat["clip_model"].encode_image(clip_in).float() + dists = spherical_dist_loss(image_embeds.unsqueeze(1), model_stat["target_embeds"].unsqueeze(0)) + dists = dists.view([args.cut_overview[1000 - t_int] + args.cut_innercut[1000 - t_int], n, -1]) + losses = dists.mul(model_stat["weights"]).sum(2).mean(0) + loss_values.append(losses.sum().item()) # log loss, probably shouldn't do per cutn_batch + x_in_grad += torch.autograd.grad(losses.sum() * clip_guidance_scale, x_in)[0] / cutn_batches + tv_losses = tv_loss(x_in) + range_losses = range_loss(out['pred_xstart']) + sat_losses = torch.abs(x_in - x_in.clamp(min=-1, max=1)).mean() + loss = tv_losses.sum() * tv_scale + range_losses.sum() * range_scale + sat_losses.sum() * sat_scale + if init is not None and init_scale: + init_losses = self.lpips_model(x_in, init) + loss = loss + init_losses.sum() * init_scale + x_in_grad += torch.autograd.grad(loss, x_in)[0] + if not torch.isnan(x_in_grad).any(): + grad = -torch.autograd.grad(x_in, x, x_in_grad)[0] + else: + x_is_NaN = True + grad = torch.zeros_like(x) + if args.clamp_grad and not x_is_NaN: + magnitude = grad.square().mean().sqrt() + return grad * magnitude.clamp(max=args.clamp_max) / magnitude # min=-0.02, min=-clamp_max, + return grad + + if args.diffusion_sampling_mode == 'ddim': + sample_fn = self.diffusion.ddim_sample_loop_progressive + else: + sample_fn = self.diffusion.plms_sample_loop_progressive + + for i in range(args.n_batches): + current_time = datetime.now().strftime('%y%m%d-%H%M%S_%f') + + batchBar = tqdm(range(args.n_batches), desc="Batches") + batchBar.n = i + batchBar.refresh() + gc.collect() + torch.cuda.empty_cache() + cur_t = self.diffusion.num_timesteps - skip_steps - 1 + # total_steps = cur_t + + if args.diffusion_sampling_mode == 'ddim': + samples = sample_fn( + self.model, + (batch_size, 3, side_y, side_x), + clip_denoised=clip_denoised, + model_kwargs={}, + cond_fn=cond_fn, + progress=True, + skip_timesteps=skip_steps, + init_image=init, + randomize_class=randomize_class, + eta=eta, + transformation_fn=symmetry_transformation_fn, + transformation_percent=args.transformation_percent + ) + else: + samples = sample_fn( + self.model, + (batch_size, 3, side_y, side_x), + clip_denoised=clip_denoised, + model_kwargs={}, + cond_fn=cond_fn, + progress=True, + skip_timesteps=skip_steps, + init_image=init, + randomize_class=randomize_class, + order=2, + ) + + for j, sample in enumerate(samples): + cur_t -= 1 + intermediateStep = False + if args.steps_per_checkpoint is not None: + if j % steps_per_checkpoint == 0 and j > 0: + intermediateStep = True + elif j in args.intermediate_saves: + intermediateStep = True + if j % args.display_rate == 0 or cur_t == -1 or 
intermediateStep: + for k, image in enumerate(sample['pred_xstart']): + # tqdm.write(f'Batch {i}, step {j}, output {k}:') + # percent = math.ceil(j / total_steps * 100) + if args.n_batches > 0: + filename = f'{current_time}-{parse_prompt(prompt)[0]}.png' + image = TF.to_pil_image(image.add(1).div(2).clamp(0, 1)) + if j % args.display_rate == 0 or cur_t == -1: + image.save(f'{outDirPath}/{filename}') + if st_dynamic_image: + st_dynamic_image.image(image, use_column_width=True) + # self.current_image = image + return image + + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description="setting") + parser.add_argument('--prompt', type=str, required=True) + parser.add_argument('--text_scale', type=int, default=5000) + parser.add_argument('--model_path', type=str, default="IDEA-CCNL/Taiyi-Diffusion-532M-Nature") + parser.add_argument('--width', type=int, default=512) + parser.add_argument('--height', type=int, default=512) + + user_args = parser.parse_args() + + dd = Diffuser(user_args.model_path) + dd.generate([user_args.prompt], + clip_guidance_scale=user_args.text_scale, + side_x=user_args.width, + side_y=user_args.height, + ) diff --git a/fengshen/examples/disco_project/guided_diffusion/.gitignore b/fengshen/examples/disco_project/guided_diffusion/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..2d97991d7edb176fa2b5829b9f3ceb6405bdb108 --- /dev/null +++ b/fengshen/examples/disco_project/guided_diffusion/.gitignore @@ -0,0 +1,3 @@ +.DS_Store +__pycache__/ + diff --git a/fengshen/examples/disco_project/guided_diffusion/LICENSE b/fengshen/examples/disco_project/guided_diffusion/LICENSE new file mode 100644 index 0000000000000000000000000000000000000000..9e84fcbc4d81a1f433c90caf9f1cef373c12edae --- /dev/null +++ b/fengshen/examples/disco_project/guided_diffusion/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2021 OpenAI + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
\ No newline at end of file diff --git a/fengshen/examples/disco_project/guided_diffusion/__init__.py b/fengshen/examples/disco_project/guided_diffusion/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/fengshen/examples/disco_project/guided_diffusion/guided_diffusion/__init__.py b/fengshen/examples/disco_project/guided_diffusion/guided_diffusion/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..9665a0d63f695eab303318d824dad14041c7cde9 --- /dev/null +++ b/fengshen/examples/disco_project/guided_diffusion/guided_diffusion/__init__.py @@ -0,0 +1,3 @@ +""" +Codebase for "Improved Denoising Diffusion Probabilistic Models". +""" diff --git a/fengshen/examples/disco_project/guided_diffusion/guided_diffusion/fp16_util.py b/fengshen/examples/disco_project/guided_diffusion/guided_diffusion/fp16_util.py new file mode 100644 index 0000000000000000000000000000000000000000..2c886705ad4dadb1f0b8b0624cc8f9e8d2dab0c9 --- /dev/null +++ b/fengshen/examples/disco_project/guided_diffusion/guided_diffusion/fp16_util.py @@ -0,0 +1,236 @@ +""" +Helpers to train with 16-bit precision. +""" + +import numpy as np +import torch as th +import torch.nn as nn +from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors + +from . import logger + +INITIAL_LOG_LOSS_SCALE = 20.0 + + +def convert_module_to_f16(ll): + """ + Convert primitive modules to float16. + """ + if isinstance(ll, (nn.Conv1d, nn.Conv2d, nn.Conv3d)): + ll.weight.data = ll.weight.data.half() + if ll.bias is not None: + ll.bias.data = ll.bias.data.half() + + +def convert_module_to_f32(ll): + """ + Convert primitive modules to float32, undoing convert_module_to_f16(). + """ + if isinstance(ll, (nn.Conv1d, nn.Conv2d, nn.Conv3d)): + ll.weight.data = ll.weight.data.float() + if ll.bias is not None: + ll.bias.data = ll.bias.data.float() + + +def make_master_params(param_groups_and_shapes): + """ + Copy model parameters into a (differently-shaped) list of full-precision + parameters. + """ + master_params = [] + for param_group, shape in param_groups_and_shapes: + master_param = nn.Parameter( + _flatten_dense_tensors( + [param.detach().float() for (_, param) in param_group] + ).view(shape) + ) + master_param.requires_grad = True + master_params.append(master_param) + return master_params + + +def model_grads_to_master_grads(param_groups_and_shapes, master_params): + """ + Copy the gradients from the model parameters into the master parameters + from make_master_params(). + """ + for master_param, (param_group, shape) in zip( + master_params, param_groups_and_shapes + ): + master_param.grad = _flatten_dense_tensors( + [param_grad_or_zeros(param) for (_, param) in param_group] + ).view(shape) + + +def master_params_to_model_params(param_groups_and_shapes, master_params): + """ + Copy the master parameter data back into the model parameters. + """ + # Without copying to a list, if a generator is passed, this will + # silently not copy any parameters. 
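+    # Each entry in param_groups_and_shapes pairs a list of (name, param) model
+    # tensors with the shape of the flat fp32 master tensor built from them in
+    # make_master_params(); below, that flat tensor is split back into the original
+    # per-parameter shapes and copied in place into the (possibly fp16) model params.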
+ for master_param, (param_group, _) in zip(master_params, param_groups_and_shapes): + for (_, param), unflat_master_param in zip( + param_group, unflatten_master_params(param_group, master_param.view(-1)) + ): + param.detach().copy_(unflat_master_param) + + +def unflatten_master_params(param_group, master_param): + return _unflatten_dense_tensors(master_param, [param for (_, param) in param_group]) + + +def get_param_groups_and_shapes(named_model_params): + named_model_params = list(named_model_params) + scalar_vector_named_params = ( + [(n, p) for (n, p) in named_model_params if p.ndim <= 1], + (-1), + ) + matrix_named_params = ( + [(n, p) for (n, p) in named_model_params if p.ndim > 1], + (1, -1), + ) + return [scalar_vector_named_params, matrix_named_params] + + +def master_params_to_state_dict( + model, param_groups_and_shapes, master_params, use_fp16 +): + if use_fp16: + state_dict = model.state_dict() + for master_param, (param_group, _) in zip( + master_params, param_groups_and_shapes + ): + for (name, _), unflat_master_param in zip( + param_group, unflatten_master_params(param_group, master_param.view(-1)) + ): + assert name in state_dict + state_dict[name] = unflat_master_param + else: + state_dict = model.state_dict() + for i, (name, _value) in enumerate(model.named_parameters()): + assert name in state_dict + state_dict[name] = master_params[i] + return state_dict + + +def state_dict_to_master_params(model, state_dict, use_fp16): + if use_fp16: + named_model_params = [ + (name, state_dict[name]) for name, _ in model.named_parameters() + ] + param_groups_and_shapes = get_param_groups_and_shapes(named_model_params) + master_params = make_master_params(param_groups_and_shapes) + else: + master_params = [state_dict[name] for name, _ in model.named_parameters()] + return master_params + + +def zero_master_grads(master_params): + for param in master_params: + param.grad = None + + +def zero_grad(model_params): + for param in model_params: + # Taken from https://pytorch.org/docs/stable/_modules/torch/optim/optimizer.html#Optimizer.add_param_group + if param.grad is not None: + param.grad.detach_() + param.grad.zero_() + + +def param_grad_or_zeros(param): + if param.grad is not None: + return param.grad.data.detach() + else: + return th.zeros_like(param) + + +class MixedPrecisionTrainer: + def __init__( + self, + *, + model, + use_fp16=False, + fp16_scale_growth=1e-3, + initial_lg_loss_scale=INITIAL_LOG_LOSS_SCALE, + ): + self.model = model + self.use_fp16 = use_fp16 + self.fp16_scale_growth = fp16_scale_growth + + self.model_params = list(self.model.parameters()) + self.master_params = self.model_params + self.param_groups_and_shapes = None + self.lg_loss_scale = initial_lg_loss_scale + + if self.use_fp16: + self.param_groups_and_shapes = get_param_groups_and_shapes( + self.model.named_parameters() + ) + self.master_params = make_master_params(self.param_groups_and_shapes) + self.model.convert_to_fp16() + + def zero_grad(self): + zero_grad(self.model_params) + + def backward(self, loss: th.Tensor): + if self.use_fp16: + loss_scale = 2 ** self.lg_loss_scale + (loss * loss_scale).backward() + else: + loss.backward() + + def optimize(self, opt: th.optim.Optimizer): + if self.use_fp16: + return self._optimize_fp16(opt) + else: + return self._optimize_normal(opt) + + def _optimize_fp16(self, opt: th.optim.Optimizer): + logger.logkv_mean("lg_loss_scale", self.lg_loss_scale) + model_grads_to_master_grads(self.param_groups_and_shapes, self.master_params) + grad_norm, param_norm = 
self._compute_norms(grad_scale=2 ** self.lg_loss_scale) + if check_overflow(grad_norm): + self.lg_loss_scale -= 1 + logger.log(f"Found NaN, decreased lg_loss_scale to {self.lg_loss_scale}") + zero_master_grads(self.master_params) + return False + + logger.logkv_mean("grad_norm", grad_norm) + logger.logkv_mean("param_norm", param_norm) + + self.master_params[0].grad.mul_(1.0 / (2 ** self.lg_loss_scale)) + opt.step() + zero_master_grads(self.master_params) + master_params_to_model_params(self.param_groups_and_shapes, self.master_params) + self.lg_loss_scale += self.fp16_scale_growth + return True + + def _optimize_normal(self, opt: th.optim.Optimizer): + grad_norm, param_norm = self._compute_norms() + logger.logkv_mean("grad_norm", grad_norm) + logger.logkv_mean("param_norm", param_norm) + opt.step() + return True + + def _compute_norms(self, grad_scale=1.0): + grad_norm = 0.0 + param_norm = 0.0 + for p in self.master_params: + with th.no_grad(): + param_norm += th.norm(p, p=2, dtype=th.float32).item() ** 2 + if p.grad is not None: + grad_norm += th.norm(p.grad, p=2, dtype=th.float32).item() ** 2 + return np.sqrt(grad_norm) / grad_scale, np.sqrt(param_norm) + + def master_params_to_state_dict(self, master_params): + return master_params_to_state_dict( + self.model, self.param_groups_and_shapes, master_params, self.use_fp16 + ) + + def state_dict_to_master_params(self, state_dict): + return state_dict_to_master_params(self.model, state_dict, self.use_fp16) + + +def check_overflow(value): + return (value == float("inf")) or (value == -float("inf")) or (value != value) diff --git a/fengshen/examples/disco_project/guided_diffusion/guided_diffusion/gaussian_diffusion.py b/fengshen/examples/disco_project/guided_diffusion/guided_diffusion/gaussian_diffusion.py new file mode 100644 index 0000000000000000000000000000000000000000..51f13385337c0b4ca9f25cb4850eb245904a6443 --- /dev/null +++ b/fengshen/examples/disco_project/guided_diffusion/guided_diffusion/gaussian_diffusion.py @@ -0,0 +1,1316 @@ +""" +This code started out as a PyTorch port of Ho et al's diffusion models: +https://github.com/hojonathanho/diffusion/blob/1e0dceb3b3495bbe19116a5e1b3596cd0706c543/diffusion_tf/diffusion_utils_2.py + +Docstrings have been added, as well as DDIM sampling and a new collection of beta schedules. +""" + +import enum +import math + +import numpy as np +import torch as th + +from .nn import mean_flat +from .losses import normal_kl, discretized_gaussian_log_likelihood + + +def get_named_beta_schedule(schedule_name, num_diffusion_timesteps): + """ + Get a pre-defined beta schedule for the given name. + + The beta schedule library consists of beta schedules which remain similar + in the limit of num_diffusion_timesteps. + Beta schedules may be added, but should not be removed or changed once + they are committed to maintain backwards compatibility. + """ + if schedule_name == "linear": + # Linear schedule from Ho et al, extended to work for any number of + # diffusion steps. 
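+        # With 1000 steps this reproduces the original Ho et al. range [1e-4, 0.02];
+        # for other step counts both endpoints are scaled by 1000 / num_diffusion_timesteps
+        # (e.g. 500 steps gives a linear ramp from 2e-4 to 0.04), which keeps the
+        # overall noise schedule roughly comparable across step counts.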
+ scale = 1000 / num_diffusion_timesteps + beta_start = scale * 0.0001 + beta_end = scale * 0.02 + return np.linspace( + beta_start, beta_end, num_diffusion_timesteps, dtype=np.float64 + ) + elif schedule_name == "cosine": + return betas_for_alpha_bar( + num_diffusion_timesteps, + lambda t: math.cos((t + 0.008) / 1.008 * math.pi / 2) ** 2, + ) + else: + raise NotImplementedError(f"unknown beta schedule: {schedule_name}") + + +def betas_for_alpha_bar(num_diffusion_timesteps, alpha_bar, max_beta=0.999): + """ + Create a beta schedule that discretizes the given alpha_t_bar function, + which defines the cumulative product of (1-beta) over time from t = [0,1]. + + :param num_diffusion_timesteps: the number of betas to produce. + :param alpha_bar: a lambda that takes an argument t from 0 to 1 and + produces the cumulative product of (1-beta) up to that + part of the diffusion process. + :param max_beta: the maximum beta to use; use values lower than 1 to + prevent singularities. + """ + betas = [] + for i in range(num_diffusion_timesteps): + t1 = i / num_diffusion_timesteps + t2 = (i + 1) / num_diffusion_timesteps + betas.append(min(1 - alpha_bar(t2) / alpha_bar(t1), max_beta)) + return np.array(betas) + + +class ModelMeanType(enum.Enum): + """ + Which type of output the model predicts. + """ + + PREVIOUS_X = enum.auto() # the model predicts x_{t-1} + START_X = enum.auto() # the model predicts x_0 + EPSILON = enum.auto() # the model predicts epsilon + + +class ModelVarType(enum.Enum): + """ + What is used as the model's output variance. + + The LEARNED_RANGE option has been added to allow the model to predict + values between FIXED_SMALL and FIXED_LARGE, making its job easier. + """ + + LEARNED = enum.auto() + FIXED_SMALL = enum.auto() + FIXED_LARGE = enum.auto() + LEARNED_RANGE = enum.auto() + + +class LossType(enum.Enum): + MSE = enum.auto() # use raw MSE loss (and KL when learning variances) + RESCALED_MSE = ( + enum.auto() + ) # use raw MSE loss (with RESCALED_KL when learning variances) + KL = enum.auto() # use the variational lower-bound + RESCALED_KL = enum.auto() # like KL, but rescale to estimate the full VLB + + def is_vb(self): + return self == LossType.KL or self == LossType.RESCALED_KL + + +class GaussianDiffusion: + """ + Utilities for training and sampling diffusion models. + + Ported directly from here, and then adapted over time to further experimentation. + https://github.com/hojonathanho/diffusion/blob/1e0dceb3b3495bbe19116a5e1b3596cd0706c543/diffusion_tf/diffusion_utils_2.py#L42 + + :param betas: a 1-D numpy array of betas for each diffusion timestep, + starting at T and going to 1. + :param model_mean_type: a ModelMeanType determining what the model outputs. + :param model_var_type: a ModelVarType determining how variance is output. + :param loss_type: a LossType determining the loss function to use. + :param rescale_timesteps: if True, pass floating point timesteps into the + model so that they are always scaled like in the + original paper (0 to 1000). + """ + + def __init__( + self, + *, + betas, + model_mean_type, + model_var_type, + loss_type, + rescale_timesteps=False, + ): + self.model_mean_type = model_mean_type + self.model_var_type = model_var_type + self.loss_type = loss_type + self.rescale_timesteps = rescale_timesteps + + # Use float64 for accuracy. 
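+        # Everything below is precomputed from the beta schedule: with
+        # alphas_cumprod = cumprod(1 - beta), q_sample() uses
+        #     x_t = sqrt(alphas_cumprod[t]) * x_0 + sqrt(1 - alphas_cumprod[t]) * noise,
+        # and the posterior_* arrays give the mean/variance of q(x_{t-1} | x_t, x_0).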
+ betas = np.array(betas, dtype=np.float64) + self.betas = betas + assert len(betas.shape) == 1, "betas must be 1-D" + assert (betas > 0).all() and (betas <= 1).all() + + self.num_timesteps = int(betas.shape[0]) + + alphas = 1.0 - betas + self.alphas_cumprod = np.cumprod(alphas, axis=0) + self.alphas_cumprod_prev = np.append(1.0, self.alphas_cumprod[:-1]) + self.alphas_cumprod_next = np.append(self.alphas_cumprod[1:], 0.0) + assert self.alphas_cumprod_prev.shape == (self.num_timesteps,) + + # calculations for diffusion q(x_t | x_{t-1}) and others + self.sqrt_alphas_cumprod = np.sqrt(self.alphas_cumprod) + self.sqrt_one_minus_alphas_cumprod = np.sqrt(1.0 - self.alphas_cumprod) + self.log_one_minus_alphas_cumprod = np.log(1.0 - self.alphas_cumprod) + self.sqrt_recip_alphas_cumprod = np.sqrt(1.0 / self.alphas_cumprod) + self.sqrt_recipm1_alphas_cumprod = np.sqrt(1.0 / self.alphas_cumprod - 1) + + # calculations for posterior q(x_{t-1} | x_t, x_0) + self.posterior_variance = ( + betas * (1.0 - self.alphas_cumprod_prev) / (1.0 - self.alphas_cumprod) + ) + # log calculation clipped because the posterior variance is 0 at the + # beginning of the diffusion chain. + self.posterior_log_variance_clipped = np.log( + np.append(self.posterior_variance[1], self.posterior_variance[1:]) + ) + self.posterior_mean_coef1 = ( + betas * np.sqrt(self.alphas_cumprod_prev) / (1.0 - self.alphas_cumprod) + ) + self.posterior_mean_coef2 = ( + (1.0 - self.alphas_cumprod_prev) * np.sqrt(alphas) / (1.0 - self.alphas_cumprod) + ) + + def q_mean_variance(self, x_start, t): + """ + Get the distribution q(x_t | x_0). + + :param x_start: the [N x C x ...] tensor of noiseless inputs. + :param t: the number of diffusion steps (minus 1). Here, 0 means one step. + :return: A tuple (mean, variance, log_variance), all of x_start's shape. + """ + mean = ( + _extract_into_tensor(self.sqrt_alphas_cumprod, t, x_start.shape) * x_start + ) + variance = _extract_into_tensor(1.0 - self.alphas_cumprod, t, x_start.shape) + log_variance = _extract_into_tensor( + self.log_one_minus_alphas_cumprod, t, x_start.shape + ) + return mean, variance, log_variance + + def q_sample(self, x_start, t, noise=None): + """ + Diffuse the data for a given number of diffusion steps. + + In other words, sample from q(x_t | x_0). + + :param x_start: the initial data batch. + :param t: the number of diffusion steps (minus 1). Here, 0 means one step. + :param noise: if specified, the split-out normal noise. + :return: A noisy version of x_start. 
+ """ + if noise is None: + noise = th.randn_like(x_start) + assert noise.shape == x_start.shape + return ( + _extract_into_tensor(self.sqrt_alphas_cumprod, t, x_start.shape) * x_start + _extract_into_tensor(self.sqrt_one_minus_alphas_cumprod, t, x_start.shape) * noise + ) + + def q_posterior_mean_variance(self, x_start, x_t, t): + """ + Compute the mean and variance of the diffusion posterior: + + q(x_{t-1} | x_t, x_0) + + """ + assert x_start.shape == x_t.shape + posterior_mean = ( + _extract_into_tensor(self.posterior_mean_coef1, t, x_t.shape) * x_start + _extract_into_tensor(self.posterior_mean_coef2, t, x_t.shape) * x_t + ) + posterior_variance = _extract_into_tensor(self.posterior_variance, t, x_t.shape) + posterior_log_variance_clipped = _extract_into_tensor( + self.posterior_log_variance_clipped, t, x_t.shape + ) + assert ( + posterior_mean.shape[0] == posterior_variance.shape[0] == posterior_log_variance_clipped.shape[0] == x_start.shape[0] + ) + return posterior_mean, posterior_variance, posterior_log_variance_clipped + + def p_mean_variance( + self, model, x, t, clip_denoised=True, denoised_fn=None, model_kwargs=None + ): + """ + Apply the model to get p(x_{t-1} | x_t), as well as a prediction of + the initial x, x_0. + + :param model: the model, which takes a signal and a batch of timesteps + as input. + :param x: the [N x C x ...] tensor at time t. + :param t: a 1-D Tensor of timesteps. + :param clip_denoised: if True, clip the denoised signal into [-1, 1]. + :param denoised_fn: if not None, a function which applies to the + x_start prediction before it is used to sample. Applies before + clip_denoised. + :param model_kwargs: if not None, a dict of extra keyword arguments to + pass to the model. This can be used for conditioning. + :return: a dict with the following keys: + - 'mean': the model mean output. + - 'variance': the model variance output. + - 'log_variance': the log of 'variance'. + - 'pred_xstart': the prediction for x_0. + """ + if model_kwargs is None: + model_kwargs = {} + + B, C = x.shape[:2] + assert t.shape == (B,) + model_output = model(x, self._scale_timesteps(t), **model_kwargs) + + if self.model_var_type in [ModelVarType.LEARNED, ModelVarType.LEARNED_RANGE]: + assert model_output.shape == (B, C * 2, *x.shape[2:]) + model_output, model_var_values = th.split(model_output, C, dim=1) + if self.model_var_type == ModelVarType.LEARNED: + model_log_variance = model_var_values + model_variance = th.exp(model_log_variance) + else: + min_log = _extract_into_tensor( + self.posterior_log_variance_clipped, t, x.shape + ) + max_log = _extract_into_tensor(np.log(self.betas), t, x.shape) + # The model_var_values is [-1, 1] for [min_var, max_var]. + frac = (model_var_values + 1) / 2 + model_log_variance = frac * max_log + (1 - frac) * min_log + model_variance = th.exp(model_log_variance) + else: + model_variance, model_log_variance = { + # for fixedlarge, we set the initial (log-)variance like so + # to get a better decoder log likelihood. 
+ ModelVarType.FIXED_LARGE: ( + np.append(self.posterior_variance[1], self.betas[1:]), + np.log(np.append(self.posterior_variance[1], self.betas[1:])), + ), + ModelVarType.FIXED_SMALL: ( + self.posterior_variance, + self.posterior_log_variance_clipped, + ), + }[self.model_var_type] + model_variance = _extract_into_tensor(model_variance, t, x.shape) + model_log_variance = _extract_into_tensor(model_log_variance, t, x.shape) + + def process_xstart(x): + if denoised_fn is not None: + x = denoised_fn(x) + if clip_denoised: + return x.clamp(-1, 1) + return x + + if self.model_mean_type == ModelMeanType.PREVIOUS_X: + pred_xstart = process_xstart( + self._predict_xstart_from_xprev(x_t=x, t=t, xprev=model_output) + ) + model_mean = model_output + elif self.model_mean_type in [ModelMeanType.START_X, ModelMeanType.EPSILON]: + if self.model_mean_type == ModelMeanType.START_X: + pred_xstart = process_xstart(model_output) + else: + pred_xstart = process_xstart( + self._predict_xstart_from_eps(x_t=x, t=t, eps=model_output) + ) + model_mean, _, _ = self.q_posterior_mean_variance( + x_start=pred_xstart, x_t=x, t=t + ) + else: + raise NotImplementedError(self.model_mean_type) + + assert ( + model_mean.shape == model_log_variance.shape == pred_xstart.shape == x.shape + ) + return { + "mean": model_mean, + "variance": model_variance, + "log_variance": model_log_variance, + "pred_xstart": pred_xstart, + } + + def _predict_xstart_from_eps(self, x_t, t, eps): + assert x_t.shape == eps.shape + return ( + _extract_into_tensor(self.sqrt_recip_alphas_cumprod, t, x_t.shape) * x_t - _extract_into_tensor(self.sqrt_recipm1_alphas_cumprod, t, x_t.shape) * eps + ) + + def _predict_xstart_from_xprev(self, x_t, t, xprev): + assert x_t.shape == xprev.shape + return ( # (xprev - coef2*x_t) / coef1 + _extract_into_tensor(1.0 / self.posterior_mean_coef1, t, x_t.shape) * xprev - _extract_into_tensor(self.posterior_mean_coef2 / self.posterior_mean_coef1, t, x_t.shape) * x_t + ) + + def _predict_eps_from_xstart(self, x_t, t, pred_xstart): + return ( + _extract_into_tensor(self.sqrt_recip_alphas_cumprod, t, x_t.shape) * x_t - pred_xstart) / _extract_into_tensor(self.sqrt_recipm1_alphas_cumprod, t, x_t.shape) + + def _scale_timesteps(self, t): + if self.rescale_timesteps: + return t.float() * (1000.0 / self.num_timesteps) + return t + + def condition_mean(self, cond_fn, p_mean_var, x, t, model_kwargs=None): + """ + Compute the mean for the previous step, given a function cond_fn that + computes the gradient of a conditional log probability with respect to + x. In particular, cond_fn computes grad(log(p(y|x))), and we want to + condition on y. + + This uses the conditioning strategy from Sohl-Dickstein et al. (2015). + """ + gradient = cond_fn(x, self._scale_timesteps(t), **model_kwargs) + new_mean = ( + p_mean_var["mean"].float() + p_mean_var["variance"] * gradient.float() + ) + return new_mean + + def condition_mean_with_grad(self, cond_fn, p_mean_var, x, t, model_kwargs=None): + """ + Compute the mean for the previous step, given a function cond_fn that + computes the gradient of a conditional log probability with respect to + x. In particular, cond_fn computes grad(log(p(y|x))), and we want to + condition on y. + + This uses the conditioning strategy from Sohl-Dickstein et al. (2015). 
+ """ + gradient = cond_fn(x, t, p_mean_var, **model_kwargs) + new_mean = ( + p_mean_var["mean"].float() + p_mean_var["variance"] * gradient.float() + ) + return new_mean + + def condition_score(self, cond_fn, p_mean_var, x, t, model_kwargs=None): + """ + Compute what the p_mean_variance output would have been, should the + model's score function be conditioned by cond_fn. + + See condition_mean() for details on cond_fn. + + Unlike condition_mean(), this instead uses the conditioning strategy + from Song et al (2020). + """ + alpha_bar = _extract_into_tensor(self.alphas_cumprod, t, x.shape) + + eps = self._predict_eps_from_xstart(x, t, p_mean_var["pred_xstart"]) + eps = eps - (1 - alpha_bar).sqrt() * cond_fn( + x, self._scale_timesteps(t), **model_kwargs + ) + + out = p_mean_var.copy() + out["pred_xstart"] = self._predict_xstart_from_eps(x, t, eps) + out["mean"], _, _ = self.q_posterior_mean_variance( + x_start=out["pred_xstart"], x_t=x, t=t + ) + return out + + def condition_score_with_grad(self, cond_fn, p_mean_var, x, t, model_kwargs=None): + """ + Compute what the p_mean_variance output would have been, should the + model's score function be conditioned by cond_fn. + + See condition_mean() for details on cond_fn. + + Unlike condition_mean(), this instead uses the conditioning strategy + from Song et al (2020). + """ + alpha_bar = _extract_into_tensor(self.alphas_cumprod, t, x.shape) + + eps = self._predict_eps_from_xstart(x, t, p_mean_var["pred_xstart"]) + eps = eps - (1 - alpha_bar).sqrt() * cond_fn( + x, t, p_mean_var, **model_kwargs + ) + + out = p_mean_var.copy() + out["pred_xstart"] = self._predict_xstart_from_eps(x, t, eps) + out["mean"], _, _ = self.q_posterior_mean_variance( + x_start=out["pred_xstart"], x_t=x, t=t + ) + return out + + def p_sample( + self, + model, + x, + t, + clip_denoised=True, + denoised_fn=None, + cond_fn=None, + model_kwargs=None, + ): + """ + Sample x_{t-1} from the model at the given timestep. + + :param model: the model to sample from. + :param x: the current tensor at x_{t-1}. + :param t: the value of t, starting at 0 for the first diffusion step. + :param clip_denoised: if True, clip the x_start prediction to [-1, 1]. + :param denoised_fn: if not None, a function which applies to the + x_start prediction before it is used to sample. + :param cond_fn: if not None, this is a gradient function that acts + similarly to the model. + :param model_kwargs: if not None, a dict of extra keyword arguments to + pass to the model. This can be used for conditioning. + :return: a dict containing the following keys: + - 'sample': a random sample from the model. + - 'pred_xstart': a prediction of x_0. + """ + out = self.p_mean_variance( + model, + x, + t, + clip_denoised=clip_denoised, + denoised_fn=denoised_fn, + model_kwargs=model_kwargs, + ) + noise = th.randn_like(x) + nonzero_mask = ( + (t != 0).float().view(-1, *([1] * (len(x.shape) - 1))) + ) # no noise when t == 0 + if cond_fn is not None: + out["mean"] = self.condition_mean( + cond_fn, out, x, t, model_kwargs=model_kwargs + ) + sample = out["mean"] + nonzero_mask * th.exp(0.5 * out["log_variance"]) * noise + return {"sample": sample, "pred_xstart": out["pred_xstart"]} + + def p_sample_with_grad( + self, + model, + x, + t, + clip_denoised=True, + denoised_fn=None, + cond_fn=None, + model_kwargs=None, + ): + """ + Sample x_{t-1} from the model at the given timestep. + + :param model: the model to sample from. + :param x: the current tensor at x_{t-1}. 
+ :param t: the value of t, starting at 0 for the first diffusion step. + :param clip_denoised: if True, clip the x_start prediction to [-1, 1]. + :param denoised_fn: if not None, a function which applies to the + x_start prediction before it is used to sample. + :param cond_fn: if not None, this is a gradient function that acts + similarly to the model. + :param model_kwargs: if not None, a dict of extra keyword arguments to + pass to the model. This can be used for conditioning. + :return: a dict containing the following keys: + - 'sample': a random sample from the model. + - 'pred_xstart': a prediction of x_0. + """ + with th.enable_grad(): + x = x.detach().requires_grad_() + out = self.p_mean_variance( + model, + x, + t, + clip_denoised=clip_denoised, + denoised_fn=denoised_fn, + model_kwargs=model_kwargs, + ) + noise = th.randn_like(x) + nonzero_mask = ( + (t != 0).float().view(-1, *([1] * (len(x.shape) - 1))) + ) # no noise when t == 0 + if cond_fn is not None: + out["mean"] = self.condition_mean_with_grad( + cond_fn, out, x, t, model_kwargs=model_kwargs + ) + sample = out["mean"] + nonzero_mask * th.exp(0.5 * out["log_variance"]) * noise + return {"sample": sample, "pred_xstart": out["pred_xstart"].detach()} + + def p_sample_loop( + self, + model, + shape, + noise=None, + clip_denoised=True, + denoised_fn=None, + cond_fn=None, + model_kwargs=None, + device=None, + progress=False, + skip_timesteps=0, + init_image=None, + randomize_class=False, + cond_fn_with_grad=False, + ): + """ + Generate samples from the model. + + :param model: the model module. + :param shape: the shape of the samples, (N, C, H, W). + :param noise: if specified, the noise from the encoder to sample. + Should be of the same shape as `shape`. + :param clip_denoised: if True, clip x_start predictions to [-1, 1]. + :param denoised_fn: if not None, a function which applies to the + x_start prediction before it is used to sample. + :param cond_fn: if not None, this is a gradient function that acts + similarly to the model. + :param model_kwargs: if not None, a dict of extra keyword arguments to + pass to the model. This can be used for conditioning. + :param device: if specified, the device to create the samples on. + If not specified, use a model parameter's device. + :param progress: if True, show a tqdm progress bar. + :return: a non-differentiable batch of samples. + """ + final = None + for sample in self.p_sample_loop_progressive( + model, + shape, + noise=noise, + clip_denoised=clip_denoised, + denoised_fn=denoised_fn, + cond_fn=cond_fn, + model_kwargs=model_kwargs, + device=device, + progress=progress, + skip_timesteps=skip_timesteps, + init_image=init_image, + randomize_class=randomize_class, + cond_fn_with_grad=cond_fn_with_grad, + ): + final = sample + return final["sample"] + + def p_sample_loop_progressive( + self, + model, + shape, + noise=None, + clip_denoised=True, + denoised_fn=None, + cond_fn=None, + model_kwargs=None, + device=None, + progress=False, + skip_timesteps=0, + init_image=None, + randomize_class=False, + cond_fn_with_grad=False, + ): + """ + Generate samples from the model and yield intermediate samples from + each timestep of diffusion. + + Arguments are the same as p_sample_loop(). + Returns a generator over dicts, where each dict is the return value of + p_sample(). 
+ """ + if device is None: + device = next(model.parameters()).device + assert isinstance(shape, (tuple, list)) + if noise is not None: + img = noise + else: + img = th.randn(*shape, device=device) + + if skip_timesteps and init_image is None: + init_image = th.zeros_like(img) + + indices = list(range(self.num_timesteps - skip_timesteps))[::-1] + + if init_image is not None: + my_t = th.ones([shape[0]], device=device, dtype=th.long) * indices[0] + img = self.q_sample(init_image, my_t, img) + + if progress: + # Lazy import so that we don't depend on tqdm. + from tqdm.auto import tqdm + + indices = tqdm(indices, desc="Steps") + + for i in indices: + t = th.tensor([i] * shape[0], device=device) + if randomize_class and 'y' in model_kwargs: + model_kwargs['y'] = th.randint(low=0, high=model.num_classes, + size=model_kwargs['y'].shape, + device=model_kwargs['y'].device) + with th.no_grad(): + sample_fn = self.p_sample_with_grad if cond_fn_with_grad else self.p_sample + out = sample_fn( + model, + img, + t, + clip_denoised=clip_denoised, + denoised_fn=denoised_fn, + cond_fn=cond_fn, + model_kwargs=model_kwargs, + ) + yield out + img = out["sample"] + + def ddim_sample( + self, + model, + x, + t, + clip_denoised=True, + denoised_fn=None, + cond_fn=None, + model_kwargs=None, + eta=0.0, + inpainting_mode=False, + orig_img=None, + mask_inpaint=None, + ): + """ + Sample x_{t-1} from the model using DDIM. + + Same usage as p_sample(). + """ + alpha_bar = _extract_into_tensor(self.alphas_cumprod, t, x.shape) + if inpainting_mode: + noised_orig_img = th.sqrt(alpha_bar) * orig_img + \ + th.sqrt(1 - alpha_bar) * th.randn_like(x) + # noised_orig_img_pil = TF.to_pil_image(noised_orig_img[0].add(1).div(2).clamp(0, 1)) + # noised_orig_img_pil.save(f'/content/drive/MyDrive/AI/Disco_Diffusion/images_out/InpaintingTest/inpainting_dump/noised_orig_{t[0].item()}.png') + x = (1 - mask_inpaint) * noised_orig_img + mask_inpaint * x + # mixed_x = TF.to_pil_image(x[0].add(1).div(2).clamp(0, 1)) + # mixed_x.save(f'/content/drive/MyDrive/AI/Disco_Diffusion/images_out/InpaintingTest/inpainting_dump/mixed_x_{t[0].item()}.png') + + out_orig = self.p_mean_variance( + model, + x, + t, + clip_denoised=clip_denoised, + denoised_fn=denoised_fn, + model_kwargs=model_kwargs, + ) + if cond_fn is not None: + out = self.condition_score(cond_fn, out_orig, x, t, model_kwargs=model_kwargs) + else: + out = out_orig + + # Usually our model outputs epsilon, but we re-derive it + # in case we used x_start or x_prev prediction. + eps = self._predict_eps_from_xstart(x, t, out["pred_xstart"]) + + alpha_bar_prev = _extract_into_tensor(self.alphas_cumprod_prev, t, x.shape) + sigma = ( + eta * th.sqrt((1 - alpha_bar_prev) / (1 - alpha_bar)) * th.sqrt(1 - alpha_bar / alpha_bar_prev) + ) + # Equation 12. + noise = th.randn_like(x) + mean_pred = ( + out["pred_xstart"] * th.sqrt(alpha_bar_prev) + th.sqrt(1 - alpha_bar_prev - sigma ** 2) * eps + ) + nonzero_mask = ( + (t != 0).float().view(-1, *([1] * (len(x.shape) - 1))) + ) # no noise when t == 0 + sample = mean_pred + nonzero_mask * sigma * noise + return {"sample": sample, "pred_xstart": out_orig["pred_xstart"]} + + def ddim_sample_with_grad( + self, + model, + x, + t, + clip_denoised=True, + denoised_fn=None, + cond_fn=None, + model_kwargs=None, + eta=0.0, + ): + """ + Sample x_{t-1} from the model using DDIM. + + Same usage as p_sample(). 
+ """ + with th.enable_grad(): + x = x.detach().requires_grad_() + out_orig = self.p_mean_variance( + model, + x, + t, + clip_denoised=clip_denoised, + denoised_fn=denoised_fn, + model_kwargs=model_kwargs, + ) + if cond_fn is not None: + out = self.condition_score_with_grad(cond_fn, out_orig, x, t, + model_kwargs=model_kwargs) + else: + out = out_orig + + out["pred_xstart"] = out["pred_xstart"].detach() + + # Usually our model outputs epsilon, but we re-derive it + # in case we used x_start or x_prev prediction. + eps = self._predict_eps_from_xstart(x, t, out["pred_xstart"]) + + alpha_bar = _extract_into_tensor(self.alphas_cumprod, t, x.shape) + alpha_bar_prev = _extract_into_tensor(self.alphas_cumprod_prev, t, x.shape) + sigma = ( + eta * th.sqrt((1 - alpha_bar_prev) / (1 - alpha_bar)) * th.sqrt(1 - alpha_bar / alpha_bar_prev) + ) + # Equation 12. + noise = th.randn_like(x) + mean_pred = ( + out["pred_xstart"] * th.sqrt(alpha_bar_prev) + th.sqrt(1 - alpha_bar_prev - sigma ** 2) * eps + ) + nonzero_mask = ( + (t != 0).float().view(-1, *([1] * (len(x.shape) - 1))) + ) # no noise when t == 0 + sample = mean_pred + nonzero_mask * sigma * noise + return {"sample": sample, "pred_xstart": out_orig["pred_xstart"].detach()} + + def ddim_reverse_sample( + self, + model, + x, + t, + clip_denoised=True, + denoised_fn=None, + model_kwargs=None, + eta=0.0, + ): + """ + Sample x_{t+1} from the model using DDIM reverse ODE. + """ + assert eta == 0.0, "Reverse ODE only for deterministic path" + out = self.p_mean_variance( + model, + x, + t, + clip_denoised=clip_denoised, + denoised_fn=denoised_fn, + model_kwargs=model_kwargs, + ) + # Usually our model outputs epsilon, but we re-derive it + # in case we used x_start or x_prev prediction. + eps = ( + _extract_into_tensor(self.sqrt_recip_alphas_cumprod, t, x.shape) * x - out["pred_xstart"]) / _extract_into_tensor(self.sqrt_recipm1_alphas_cumprod, t, x.shape) + alpha_bar_next = _extract_into_tensor(self.alphas_cumprod_next, t, x.shape) + + # Equation 12. reversed + mean_pred = ( + out["pred_xstart"] * th.sqrt(alpha_bar_next) + th.sqrt(1 - alpha_bar_next) * eps + ) + + return {"sample": mean_pred, "pred_xstart": out["pred_xstart"]} + + def ddim_sample_loop( + self, + model, + shape, + noise=None, + clip_denoised=True, + denoised_fn=None, + cond_fn=None, + model_kwargs=None, + device=None, + progress=False, + eta=0.0, + skip_timesteps=0, + init_image=None, + randomize_class=False, + cond_fn_with_grad=False, + ): + """ + Generate samples from the model using DDIM. + + Same usage as p_sample_loop(). + """ + final = None + for sample in self.ddim_sample_loop_progressive( + model, + shape, + noise=noise, + clip_denoised=clip_denoised, + denoised_fn=denoised_fn, + cond_fn=cond_fn, + model_kwargs=model_kwargs, + device=device, + progress=progress, + eta=eta, + skip_timesteps=skip_timesteps, + init_image=init_image, + randomize_class=randomize_class, + cond_fn_with_grad=cond_fn_with_grad, + ): + final = sample + return final["sample"] + + def ddim_sample_loop_progressive( + self, + model, + shape, + noise=None, + clip_denoised=True, + denoised_fn=None, + cond_fn=None, + model_kwargs=None, + device=None, + progress=False, + eta=0.0, + skip_timesteps=0, + init_image=None, + randomize_class=False, + cond_fn_with_grad=False, + transformation_fn=None, + transformation_percent=[], + inpainting_mode=False, + mask_inpaint=None, + skip_timesteps_orig=None + ): + """ + Use DDIM to sample from the model and yield intermediate samples from + each timestep of DDIM. 
+ + Same usage as p_sample_loop_progressive(). + """ + if device is None: + device = next(model.parameters()).device + assert isinstance(shape, (tuple, list)) + if noise is not None: + img = noise + else: + img = th.randn(*shape, device=device) + + if skip_timesteps and init_image is None: + init_image = th.zeros_like(img) + + indices = list(range(self.num_timesteps - skip_timesteps))[::-1] + transformation_steps = [int(len(indices) * (1 - i)) for i in transformation_percent] + + if init_image is not None: + my_t = th.ones([shape[0]], device=device, dtype=th.long) * indices[0] + img = self.q_sample(init_image, my_t, img) + + if progress: + # Lazy import so that we don't depend on tqdm. + from tqdm.auto import tqdm + indices = tqdm(indices, desc="Steps") + + if inpainting_mode and skip_timesteps_orig is None: + skip_timesteps_orig = self.num_timesteps + + for i in indices: + t = th.tensor([i] * shape[0], device=device) + if randomize_class and 'y' in model_kwargs: + model_kwargs['y'] = th.randint(low=0, high=model.num_classes, + size=model_kwargs['y'].shape, + device=model_kwargs['y'].device) + with th.no_grad(): + if i in transformation_steps and transformation_fn is not None: + img = transformation_fn(img) + sample_fn = self.ddim_sample_with_grad if cond_fn_with_grad else self.ddim_sample + if inpainting_mode \ + and i >= self.num_timesteps - skip_timesteps_orig \ + and not cond_fn_with_grad: + out = sample_fn( + model, + img, + t, + clip_denoised=clip_denoised, + denoised_fn=denoised_fn, + cond_fn=cond_fn, + model_kwargs=model_kwargs, + eta=eta, + inpainting_mode=inpainting_mode, + orig_img=init_image, + mask_inpaint=mask_inpaint, + ) + else: + out = sample_fn( + model, + img, + t, + clip_denoised=clip_denoised, + denoised_fn=denoised_fn, + cond_fn=cond_fn, + model_kwargs=model_kwargs, + eta=eta, + ) + yield out + img = out["sample"] + + def plms_sample( + self, + model, + x, + t, + clip_denoised=True, + denoised_fn=None, + cond_fn=None, + model_kwargs=None, + cond_fn_with_grad=False, + order=2, + old_out=None, + ): + """ + Sample x_{t-1} from the model using Pseudo Linear Multistep. + + Same usage as p_sample(). + """ + if not int(order) or not 1 <= order <= 4: + raise ValueError('order is invalid (should be int from 1-4).') + + def get_model_output(x, t): + with th.set_grad_enabled(cond_fn_with_grad and cond_fn is not None): + x = x.detach().requires_grad_() if cond_fn_with_grad else x + out_orig = self.p_mean_variance( + model, + x, + t, + clip_denoised=clip_denoised, + denoised_fn=denoised_fn, + model_kwargs=model_kwargs, + ) + if cond_fn is not None: + if cond_fn_with_grad: + out = self.condition_score_with_grad(cond_fn, out_orig, x, t, model_kwargs=model_kwargs) + x = x.detach() + else: + out = self.condition_score(cond_fn, out_orig, x, t, model_kwargs=model_kwargs) + else: + out = out_orig + + # Usually our model outputs epsilon, but we re-derive it + # in case we used x_start or x_prev prediction. 
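+            # _predict_eps_from_xstart() inverts the forward process:
+            #     eps = (sqrt(1 / alpha_bar_t) * x_t - x_0) / sqrt(1 / alpha_bar_t - 1),
+            # so the multistep combination below always works in epsilon space.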
+ eps = self._predict_eps_from_xstart(x, t, out["pred_xstart"]) + return eps, out, out_orig + + # alpha_bar = _extract_into_tensor(self.alphas_cumprod, t, x.shape) + alpha_bar_prev = _extract_into_tensor(self.alphas_cumprod_prev, t, x.shape) + eps, out, out_orig = get_model_output(x, t) + + if order > 1 and old_out is None: + # Pseudo Improved Euler + old_eps = [eps] + mean_pred = out["pred_xstart"] * th.sqrt(alpha_bar_prev) + th.sqrt(1 - alpha_bar_prev) * eps + eps_2, _, _ = get_model_output(mean_pred, t - 1) + eps_prime = (eps + eps_2) / 2 + pred_prime = self._predict_xstart_from_eps(x, t, eps_prime) + mean_pred = pred_prime * th.sqrt(alpha_bar_prev) + th.sqrt(1 - alpha_bar_prev) * eps_prime + else: + # Pseudo Linear Multistep (Adams-Bashforth) + old_eps = old_out["old_eps"] + old_eps.append(eps) + cur_order = min(order, len(old_eps)) + if cur_order == 1: + eps_prime = old_eps[-1] + elif cur_order == 2: + eps_prime = (3 * old_eps[-1] - old_eps[-2]) / 2 + elif cur_order == 3: + eps_prime = (23 * old_eps[-1] - 16 * old_eps[-2] + 5 * old_eps[-3]) / 12 + elif cur_order == 4: + eps_prime = (55 * old_eps[-1] - 59 * old_eps[-2] + 37 * old_eps[-3] - 9 * old_eps[-4]) / 24 + else: + raise RuntimeError('cur_order is invalid.') + pred_prime = self._predict_xstart_from_eps(x, t, eps_prime) + mean_pred = pred_prime * th.sqrt(alpha_bar_prev) + th.sqrt(1 - alpha_bar_prev) * eps_prime + + if len(old_eps) >= order: + old_eps.pop(0) + + nonzero_mask = (t != 0).float().view(-1, *([1] * (len(x.shape) - 1))) + sample = mean_pred * nonzero_mask + out["pred_xstart"] * (1 - nonzero_mask) + + return {"sample": sample, "pred_xstart": out_orig["pred_xstart"], "old_eps": old_eps} + + def plms_sample_loop( + self, + model, + shape, + noise=None, + clip_denoised=True, + denoised_fn=None, + cond_fn=None, + model_kwargs=None, + device=None, + progress=False, + skip_timesteps=0, + init_image=None, + randomize_class=False, + cond_fn_with_grad=False, + order=2, + ): + """ + Generate samples from the model using Pseudo Linear Multistep. + + Same usage as p_sample_loop(). + """ + final = None + for sample in self.plms_sample_loop_progressive( + model, + shape, + noise=noise, + clip_denoised=clip_denoised, + denoised_fn=denoised_fn, + cond_fn=cond_fn, + model_kwargs=model_kwargs, + device=device, + progress=progress, + skip_timesteps=skip_timesteps, + init_image=init_image, + randomize_class=randomize_class, + cond_fn_with_grad=cond_fn_with_grad, + order=order, + ): + final = sample + return final["sample"] + + def plms_sample_loop_progressive( + self, + model, + shape, + noise=None, + clip_denoised=True, + denoised_fn=None, + cond_fn=None, + model_kwargs=None, + device=None, + progress=False, + skip_timesteps=0, + init_image=None, + randomize_class=False, + cond_fn_with_grad=False, + order=2, + ): + """ + Use PLMS to sample from the model and yield intermediate samples from each + timestep of PLMS. + + Same usage as p_sample_loop_progressive(). + """ + if device is None: + device = next(model.parameters()).device + assert isinstance(shape, (tuple, list)) + if noise is not None: + img = noise + else: + img = th.randn(*shape, device=device) + + if skip_timesteps and init_image is None: + init_image = th.zeros_like(img) + + indices = list(range(self.num_timesteps - skip_timesteps))[::-1] + + if init_image is not None: + my_t = th.ones([shape[0]], device=device, dtype=th.long) * indices[0] + img = self.q_sample(init_image, my_t, img) + + if progress: + # Lazy import so that we don't depend on tqdm. 
+ from tqdm.auto import tqdm + + indices = tqdm(indices, desc="Steps") + + old_out = None + + for i in indices: + t = th.tensor([i] * shape[0], device=device) + if randomize_class and 'y' in model_kwargs: + model_kwargs['y'] = th.randint(low=0, high=model.num_classes, + size=model_kwargs['y'].shape, + device=model_kwargs['y'].device) + with th.no_grad(): + out = self.plms_sample( + model, + img, + t, + clip_denoised=clip_denoised, + denoised_fn=denoised_fn, + cond_fn=cond_fn, + model_kwargs=model_kwargs, + cond_fn_with_grad=cond_fn_with_grad, + order=order, + old_out=old_out, + ) + yield out + old_out = out + img = out["sample"] + + def _vb_terms_bpd( + self, model, x_start, x_t, t, clip_denoised=True, model_kwargs=None + ): + """ + Get a term for the variational lower-bound. + + The resulting units are bits (rather than nats, as one might expect). + This allows for comparison to other papers. + + :return: a dict with the following keys: + - 'output': a shape [N] tensor of NLLs or KLs. + - 'pred_xstart': the x_0 predictions. + """ + true_mean, _, true_log_variance_clipped = self.q_posterior_mean_variance( + x_start=x_start, x_t=x_t, t=t + ) + out = self.p_mean_variance( + model, x_t, t, clip_denoised=clip_denoised, model_kwargs=model_kwargs + ) + kl = normal_kl( + true_mean, true_log_variance_clipped, out["mean"], out["log_variance"] + ) + kl = mean_flat(kl) / np.log(2.0) + + decoder_nll = -discretized_gaussian_log_likelihood( + x_start, means=out["mean"], log_scales=0.5 * out["log_variance"] + ) + assert decoder_nll.shape == x_start.shape + decoder_nll = mean_flat(decoder_nll) / np.log(2.0) + + # At the first timestep return the decoder NLL, + # otherwise return KL(q(x_{t-1}|x_t,x_0) || p(x_{t-1}|x_t)) + output = th.where((t == 0), decoder_nll, kl) + return {"output": output, "pred_xstart": out["pred_xstart"]} + + def training_losses(self, model, x_start, t, model_kwargs=None, noise=None): + """ + Compute training losses for a single timestep. + + :param model: the model to evaluate loss on. + :param x_start: the [N x C x ...] tensor of inputs. + :param t: a batch of timestep indices. + :param model_kwargs: if not None, a dict of extra keyword arguments to + pass to the model. This can be used for conditioning. + :param noise: if specified, the specific Gaussian noise to try to remove. + :return: a dict with the key "loss" containing a tensor of shape [N]. + Some mean or variance settings may also have other keys. + """ + if model_kwargs is None: + model_kwargs = {} + if noise is None: + noise = th.randn_like(x_start) + x_t = self.q_sample(x_start, t, noise=noise) + + terms = {} + + if self.loss_type == LossType.KL or self.loss_type == LossType.RESCALED_KL: + terms["loss"] = self._vb_terms_bpd( + model=model, + x_start=x_start, + x_t=x_t, + t=t, + clip_denoised=False, + model_kwargs=model_kwargs, + )["output"] + if self.loss_type == LossType.RESCALED_KL: + terms["loss"] *= self.num_timesteps + elif self.loss_type == LossType.MSE or self.loss_type == LossType.RESCALED_MSE: + model_output = model(x_t, self._scale_timesteps(t), **model_kwargs) + + if self.model_var_type in [ + ModelVarType.LEARNED, + ModelVarType.LEARNED_RANGE, + ]: + B, C = x_t.shape[:2] + assert model_output.shape == (B, C * 2, *x_t.shape[2:]) + model_output, model_var_values = th.split(model_output, C, dim=1) + # Learn the variance using the variational bound, but don't let + # it affect our mean prediction. 
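+                # The mean/eps channels are detached before entering the VB term, so only
+                # model_var_values receives gradient from it; the lambda passed as `model`
+                # simply returns this frozen output instead of re-running the network.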
+ frozen_out = th.cat([model_output.detach(), model_var_values], dim=1) + terms["vb"] = self._vb_terms_bpd( + model=lambda *args, r=frozen_out: r, + x_start=x_start, + x_t=x_t, + t=t, + clip_denoised=False, + )["output"] + if self.loss_type == LossType.RESCALED_MSE: + # Divide by 1000 for equivalence with initial implementation. + # Without a factor of 1/1000, the VB term hurts the MSE term. + terms["vb"] *= self.num_timesteps / 1000.0 + + target = { + ModelMeanType.PREVIOUS_X: self.q_posterior_mean_variance( + x_start=x_start, x_t=x_t, t=t + )[0], + ModelMeanType.START_X: x_start, + ModelMeanType.EPSILON: noise, + }[self.model_mean_type] + assert model_output.shape == target.shape == x_start.shape + terms["mse"] = mean_flat((target - model_output) ** 2) + if "vb" in terms: + terms["loss"] = terms["mse"] + terms["vb"] + else: + terms["loss"] = terms["mse"] + else: + raise NotImplementedError(self.loss_type) + + return terms + + def _prior_bpd(self, x_start): + """ + Get the prior KL term for the variational lower-bound, measured in + bits-per-dim. + + This term can't be optimized, as it only depends on the encoder. + + :param x_start: the [N x C x ...] tensor of inputs. + :return: a batch of [N] KL values (in bits), one per batch element. + """ + batch_size = x_start.shape[0] + t = th.tensor([self.num_timesteps - 1] * batch_size, device=x_start.device) + qt_mean, _, qt_log_variance = self.q_mean_variance(x_start, t) + kl_prior = normal_kl( + mean1=qt_mean, logvar1=qt_log_variance, mean2=0.0, logvar2=0.0 + ) + return mean_flat(kl_prior) / np.log(2.0) + + def calc_bpd_loop(self, model, x_start, clip_denoised=True, model_kwargs=None): + """ + Compute the entire variational lower-bound, measured in bits-per-dim, + as well as other related quantities. + + :param model: the model to evaluate loss on. + :param x_start: the [N x C x ...] tensor of inputs. + :param clip_denoised: if True, clip denoised samples. + :param model_kwargs: if not None, a dict of extra keyword arguments to + pass to the model. This can be used for conditioning. + + :return: a dict containing the following keys: + - total_bpd: the total variational lower-bound, per batch element. + - prior_bpd: the prior term in the lower-bound. + - vb: an [N x T] tensor of terms in the lower-bound. + - xstart_mse: an [N x T] tensor of x_0 MSEs for each timestep. + - mse: an [N x T] tensor of epsilon MSEs for each timestep. 
+ """ + device = x_start.device + batch_size = x_start.shape[0] + + vb = [] + xstart_mse = [] + mse = [] + for t in list(range(self.num_timesteps))[::-1]: + t_batch = th.tensor([t] * batch_size, device=device) + noise = th.randn_like(x_start) + x_t = self.q_sample(x_start=x_start, t=t_batch, noise=noise) + # Calculate VLB term at the current timestep + with th.no_grad(): + out = self._vb_terms_bpd( + model, + x_start=x_start, + x_t=x_t, + t=t_batch, + clip_denoised=clip_denoised, + model_kwargs=model_kwargs, + ) + vb.append(out["output"]) + xstart_mse.append(mean_flat((out["pred_xstart"] - x_start) ** 2)) + eps = self._predict_eps_from_xstart(x_t, t_batch, out["pred_xstart"]) + mse.append(mean_flat((eps - noise) ** 2)) + + vb = th.stack(vb, dim=1) + xstart_mse = th.stack(xstart_mse, dim=1) + mse = th.stack(mse, dim=1) + + prior_bpd = self._prior_bpd(x_start) + total_bpd = vb.sum(dim=1) + prior_bpd + return { + "total_bpd": total_bpd, + "prior_bpd": prior_bpd, + "vb": vb, + "xstart_mse": xstart_mse, + "mse": mse, + } + + +def _extract_into_tensor(arr, timesteps, broadcast_shape): + """ + Extract values from a 1-D numpy array for a batch of indices. + + :param arr: the 1-D numpy array. + :param timesteps: a tensor of indices into the array to extract. + :param broadcast_shape: a larger shape of K dimensions with the batch + dimension equal to the length of timesteps. + :return: a tensor of shape [batch_size, 1, ...] where the shape has K dims. + """ + res = th.from_numpy(arr).to(device=timesteps.device)[timesteps].float() + while len(res.shape) < len(broadcast_shape): + res = res[..., None] + return res.expand(broadcast_shape) diff --git a/fengshen/examples/disco_project/guided_diffusion/guided_diffusion/logger.py b/fengshen/examples/disco_project/guided_diffusion/guided_diffusion/logger.py new file mode 100644 index 0000000000000000000000000000000000000000..9bdfc7b807ed34ac2334f01b9b09288c488de54e --- /dev/null +++ b/fengshen/examples/disco_project/guided_diffusion/guided_diffusion/logger.py @@ -0,0 +1,493 @@ +""" +Logger copied from OpenAI baselines to avoid extra RL-based dependencies: +https://github.com/openai/baselines/blob/ea25b9e8b234e6ee1bca43083f8f3cf974143998/baselines/logger.py +""" + +import os +import sys +import os.path as osp +import json +import time +import datetime +import tempfile +import warnings +from collections import defaultdict +from contextlib import contextmanager + +DEBUG = 10 +INFO = 20 +WARN = 30 +ERROR = 40 + +DISABLED = 50 + + +class KVWriter(object): + def writekvs(self, kvs): + raise NotImplementedError + + +class SeqWriter(object): + def writeseq(self, seq): + raise NotImplementedError + + +class HumanOutputFormat(KVWriter, SeqWriter): + def __init__(self, filename_or_file): + if isinstance(filename_or_file, str): + self.file = open(filename_or_file, "wt") + self.own_file = True + else: + assert hasattr(filename_or_file, "read"), ( + "expected file or str, got %s" % filename_or_file + ) + self.file = filename_or_file + self.own_file = False + + def writekvs(self, kvs): + # Create strings for printing + key2str = {} + for (key, val) in sorted(kvs.items()): + if hasattr(val, "__float__"): + valstr = "%-8.3g" % val + else: + valstr = str(val) + key2str[self._truncate(key)] = self._truncate(valstr) + + # Find max widths + if len(key2str) == 0: + print("WARNING: tried to write empty key-value dict") + return + else: + keywidth = max(map(len, key2str.keys())) + valwidth = max(map(len, key2str.values())) + + # Write out the data + dashes = "-" * (keywidth + 
valwidth + 7) + lines = [dashes] + for (key, val) in sorted(key2str.items(), key=lambda kv: kv[0].lower()): + lines.append( + "| %s%s | %s%s |" + % (key, " " * (keywidth - len(key)), val, " " * (valwidth - len(val))) + ) + lines.append(dashes) + self.file.write("\n".join(lines) + "\n") + + # Flush the output to the file + self.file.flush() + + def _truncate(self, s): + maxlen = 30 + return s[: maxlen - 3] + "..." if len(s) > maxlen else s + + def writeseq(self, seq): + seq = list(seq) + for (i, elem) in enumerate(seq): + self.file.write(elem) + if i < len(seq) - 1: # add space unless this is the last one + self.file.write(" ") + self.file.write("\n") + self.file.flush() + + def close(self): + if self.own_file: + self.file.close() + + +class JSONOutputFormat(KVWriter): + def __init__(self, filename): + self.file = open(filename, "wt") + + def writekvs(self, kvs): + for k, v in sorted(kvs.items()): + if hasattr(v, "dtype"): + kvs[k] = float(v) + self.file.write(json.dumps(kvs) + "\n") + self.file.flush() + + def close(self): + self.file.close() + + +class CSVOutputFormat(KVWriter): + def __init__(self, filename): + self.file = open(filename, "w+t") + self.keys = [] + self.sep = "," + + def writekvs(self, kvs): + # Add our current row to the history + extra_keys = list(kvs.keys() - self.keys) + extra_keys.sort() + if extra_keys: + self.keys.extend(extra_keys) + self.file.seek(0) + lines = self.file.readlines() + self.file.seek(0) + for (i, k) in enumerate(self.keys): + if i > 0: + self.file.write(",") + self.file.write(k) + self.file.write("\n") + for line in lines[1:]: + self.file.write(line[:-1]) + self.file.write(self.sep * len(extra_keys)) + self.file.write("\n") + for (i, k) in enumerate(self.keys): + if i > 0: + self.file.write(",") + v = kvs.get(k) + if v is not None: + self.file.write(str(v)) + self.file.write("\n") + self.file.flush() + + def close(self): + self.file.close() + + +class TensorBoardOutputFormat(KVWriter): + """ + Dumps key/value pairs into TensorBoard's numeric format. + """ + + def __init__(self, dir): + os.makedirs(dir, exist_ok=True) + self.dir = dir + self.step = 1 + prefix = "events" + path = osp.join(osp.abspath(dir), prefix) + import tensorflow as tf + from tensorflow.python import pywrap_tensorflow + from tensorflow.core.util import event_pb2 + from tensorflow.python.util import compat + + self.tf = tf + self.event_pb2 = event_pb2 + self.pywrap_tensorflow = pywrap_tensorflow + self.writer = pywrap_tensorflow.EventsWriter(compat.as_bytes(path)) + + def writekvs(self, kvs): + def summary_val(k, v): + kwargs = {"tag": k, "simple_value": float(v)} + return self.tf.Summary.Value(**kwargs) + + summary = self.tf.Summary(value=[summary_val(k, v) for k, v in kvs.items()]) + event = self.event_pb2.Event(wall_time=time.time(), summary=summary) + event.step = ( + self.step + ) # is there any reason why you'd want to specify the step? 
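+        # self.step is a per-writer counter (incremented after each dump below),
+        # presumably so successive writekvs() calls are ordered on TensorBoard's step axis.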
+ self.writer.WriteEvent(event) + self.writer.Flush() + self.step += 1 + + def close(self): + if self.writer: + self.writer.Close() + self.writer = None + + +def make_output_format(format, ev_dir, log_suffix=""): + os.makedirs(ev_dir, exist_ok=True) + if format == "stdout": + return HumanOutputFormat(sys.stdout) + elif format == "log": + return HumanOutputFormat(osp.join(ev_dir, "log%s.txt" % log_suffix)) + elif format == "json": + return JSONOutputFormat(osp.join(ev_dir, "progress%s.json" % log_suffix)) + elif format == "csv": + return CSVOutputFormat(osp.join(ev_dir, "progress%s.csv" % log_suffix)) + elif format == "tensorboard": + return TensorBoardOutputFormat(osp.join(ev_dir, "tb%s" % log_suffix)) + else: + raise ValueError("Unknown format specified: %s" % (format,)) + + +# ================================================================ +# API +# ================================================================ + + +def logkv(key, val): + """ + Log a value of some diagnostic + Call this once for each diagnostic quantity, each iteration + If called many times, last value will be used. + """ + get_current().logkv(key, val) + + +def logkv_mean(key, val): + """ + The same as logkv(), but if called many times, values averaged. + """ + get_current().logkv_mean(key, val) + + +def logkvs(d): + """ + Log a dictionary of key-value pairs + """ + for (k, v) in d.items(): + logkv(k, v) + + +def dumpkvs(): + """ + Write all of the diagnostics from the current iteration + """ + return get_current().dumpkvs() + + +def getkvs(): + return get_current().name2val + + +def log(*args, level=INFO): + """ + Write the sequence of args, with no separators, to the console and output files (if you've configured an output file). + """ + get_current().log(*args, level=level) + + +def debug(*args): + log(*args, level=DEBUG) + + +def info(*args): + log(*args, level=INFO) + + +def warn(*args): + log(*args, level=WARN) + + +def error(*args): + log(*args, level=ERROR) + + +def set_level(level): + """ + Set logging threshold on current logger. + """ + get_current().set_level(level) + + +def set_comm(comm): + get_current().set_comm(comm) + + +def get_dir(): + """ + Get directory that log files are being written to. + will be None if there is no output directory (i.e., if you didn't call start) + """ + return get_current().get_dir() + + +record_tabular = logkv +dump_tabular = dumpkvs + + +@contextmanager +def profile_kv(scopename): + logkey = "wait_" + scopename + tstart = time.time() + try: + yield + finally: + get_current().name2val[logkey] += time.time() - tstart + + +def profile(n): + """ + Usage: + @profile("my_func") + def my_func(): code + """ + + def decorator_with_name(func): + def func_wrapper(*args, **kwargs): + with profile_kv(n): + return func(*args, **kwargs) + + return func_wrapper + + return decorator_with_name + + +# ================================================================ +# Backend +# ================================================================ + + +def get_current(): + if Logger.CURRENT is None: + _configure_default_logger() + + return Logger.CURRENT + + +class Logger(object): + DEFAULT = None # A logger with no output files. 
(See right below class definition) + # So that you can still log to the terminal without setting up any output files + CURRENT = None # Current logger being used by the free functions above + + def __init__(self, dir, output_formats, comm=None): + self.name2val = defaultdict(float) # values this iteration + self.name2cnt = defaultdict(int) + self.level = INFO + self.dir = dir + self.output_formats = output_formats + self.comm = comm + + # Logging API, forwarded + # ---------------------------------------- + def logkv(self, key, val): + self.name2val[key] = val + + def logkv_mean(self, key, val): + oldval, cnt = self.name2val[key], self.name2cnt[key] + self.name2val[key] = oldval * cnt / (cnt + 1) + val / (cnt + 1) + self.name2cnt[key] = cnt + 1 + + def dumpkvs(self): + if self.comm is None: + d = self.name2val + else: + d = mpi_weighted_mean( + self.comm, + { + name: (val, self.name2cnt.get(name, 1)) + for (name, val) in self.name2val.items() + }, + ) + if self.comm.rank != 0: + d["dummy"] = 1 # so we don't get a warning about empty dict + out = d.copy() # Return the dict for unit testing purposes + for fmt in self.output_formats: + if isinstance(fmt, KVWriter): + fmt.writekvs(d) + self.name2val.clear() + self.name2cnt.clear() + return out + + def log(self, *args, level=INFO): + if self.level <= level: + self._do_log(args) + + # Configuration + # ---------------------------------------- + def set_level(self, level): + self.level = level + + def set_comm(self, comm): + self.comm = comm + + def get_dir(self): + return self.dir + + def close(self): + for fmt in self.output_formats: + fmt.close() + + # Misc + # ---------------------------------------- + def _do_log(self, args): + for fmt in self.output_formats: + if isinstance(fmt, SeqWriter): + fmt.writeseq(map(str, args)) + + +def get_rank_without_mpi_import(): + # check environment variables here instead of importing mpi4py + # to avoid calling MPI_Init() when this module is imported + for varname in ["PMI_RANK", "OMPI_COMM_WORLD_RANK"]: + if varname in os.environ: + return int(os.environ[varname]) + return 0 + + +def mpi_weighted_mean(comm, local_name2valcount): + """ + Copied from: https://github.com/openai/baselines/blob/ea25b9e8b234e6ee1bca43083f8f3cf974143998/baselines/common/mpi_util.py#L110 + Perform a weighted average over dicts that are each on a different node + Input: local_name2valcount: dict mapping key -> (value, count) + Returns: key -> mean + """ + all_name2valcount = comm.gather(local_name2valcount) + if comm.rank == 0: + name2sum = defaultdict(float) + name2count = defaultdict(float) + for n2vc in all_name2valcount: + for (name, (val, count)) in n2vc.items(): + try: + val = float(val) + except ValueError: + if comm.rank == 0: + warnings.warn( + "WARNING: tried to compute mean on non-float {}={}".format( + name, val + ) + ) + else: + name2sum[name] += val * count + name2count[name] += count + return {name: name2sum[name] / name2count[name] for name in name2sum} + else: + return {} + + +def configure(dir=None, format_strs=None, comm=None, log_suffix=""): + """ + If comm is provided, average all numerical stats across that comm + """ + if dir is None: + dir = os.getenv("OPENAI_LOGDIR") + if dir is None: + dir = osp.join( + tempfile.gettempdir(), + datetime.datetime.now().strftime("openai-%Y-%m-%d-%H-%M-%S-%f"), + ) + assert isinstance(dir, str) + dir = os.path.expanduser(dir) + os.makedirs(os.path.expanduser(dir), exist_ok=True) + + rank = get_rank_without_mpi_import() + if rank > 0: + log_suffix = log_suffix + "-rank%03i" % 
rank + + if format_strs is None: + if rank == 0: + format_strs = os.getenv("OPENAI_LOG_FORMAT", "stdout,log,csv").split(",") + else: + format_strs = os.getenv("OPENAI_LOG_FORMAT_MPI", "log").split(",") + format_strs = filter(None, format_strs) + output_formats = [make_output_format(f, dir, log_suffix) for f in format_strs] + + Logger.CURRENT = Logger(dir=dir, output_formats=output_formats, comm=comm) + if output_formats: + log("Logging to %s" % dir) + + +def _configure_default_logger(): + configure() + Logger.DEFAULT = Logger.CURRENT + + +def reset(): + if Logger.CURRENT is not Logger.DEFAULT: + Logger.CURRENT.close() + Logger.CURRENT = Logger.DEFAULT + log("Reset logger") + + +@contextmanager +def scoped_configure(dir=None, format_strs=None, comm=None): + prevlogger = Logger.CURRENT + configure(dir=dir, format_strs=format_strs, comm=comm) + try: + yield + finally: + Logger.CURRENT.close() + Logger.CURRENT = prevlogger diff --git a/fengshen/examples/disco_project/guided_diffusion/guided_diffusion/losses.py b/fengshen/examples/disco_project/guided_diffusion/guided_diffusion/losses.py new file mode 100644 index 0000000000000000000000000000000000000000..162771bf5df31a008b1f4b6b27bd42432955fdc9 --- /dev/null +++ b/fengshen/examples/disco_project/guided_diffusion/guided_diffusion/losses.py @@ -0,0 +1,73 @@ +""" +Helpers for various likelihood-based losses. These are ported from the original +Ho et al. diffusion models codebase: +https://github.com/hojonathanho/diffusion/blob/1e0dceb3b3495bbe19116a5e1b3596cd0706c543/diffusion_tf/utils.py +""" + +import numpy as np + +import torch as th + + +def normal_kl(mean1, logvar1, mean2, logvar2): + """ + Compute the KL divergence between two gaussians. + + Shapes are automatically broadcasted, so batches can be compared to + scalars, among other use cases. + """ + tensor = None + for obj in (mean1, logvar1, mean2, logvar2): + if isinstance(obj, th.Tensor): + tensor = obj + break + assert tensor is not None, "at least one argument must be a Tensor" + + # Force variances to be Tensors. Broadcasting helps convert scalars to + # Tensors, but it does not work for th.exp(). + logvar1, logvar2 = [ + x if isinstance(x, th.Tensor) else th.tensor(x).to(tensor) + for x in (logvar1, logvar2) + ] + + return 0.5 * ( + -1.0 + logvar2 - logvar1 + th.exp(logvar1 - logvar2) + ((mean1 - mean2) ** 2) * th.exp(-logvar2) + ) + + +def approx_standard_normal_cdf(x): + """ + A fast approximation of the cumulative distribution function of the + standard normal. + """ + return 0.5 * (1.0 + th.tanh(np.sqrt(2.0 / np.pi) * (x + 0.044715 * th.pow(x, 3)))) + + +def discretized_gaussian_log_likelihood(x, *, means, log_scales): + """ + Compute the log-likelihood of a Gaussian distribution discretizing to a + given image. + + :param x: the target images. It is assumed that this was uint8 values, + rescaled to the range [-1, 1]. + :param means: the Gaussian mean Tensor. + :param log_scales: the Gaussian log stddev Tensor. + :return: a tensor like x of log probabilities (in nats). 
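+
+    Illustrative example (not from the original codebase; random inputs, shown
+    only to document the expected shapes and value range):
+
+        x = (th.randint(0, 256, (2, 3, 8, 8)).float() / 127.5) - 1.0
+        means = th.zeros_like(x)
+        log_scales = th.full_like(x, -2.0)
+        nll = discretized_gaussian_log_likelihood(x, means=means, log_scales=log_scales)
+        # nll has the same shape as x; sum over non-batch dims for per-image nats.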
+ """ + assert x.shape == means.shape == log_scales.shape + centered_x = x - means + inv_stdv = th.exp(-log_scales) + plus_in = inv_stdv * (centered_x + 1.0 / 255.0) + cdf_plus = approx_standard_normal_cdf(plus_in) + min_in = inv_stdv * (centered_x - 1.0 / 255.0) + cdf_min = approx_standard_normal_cdf(min_in) + log_cdf_plus = th.log(cdf_plus.clamp(min=1e-12)) + log_one_minus_cdf_min = th.log((1.0 - cdf_min).clamp(min=1e-12)) + cdf_delta = cdf_plus - cdf_min + log_probs = th.where( + x < -0.999, + log_cdf_plus, + th.where(x > 0.999, log_one_minus_cdf_min, th.log(cdf_delta.clamp(min=1e-12))), + ) + assert log_probs.shape == x.shape + return log_probs diff --git a/fengshen/examples/disco_project/guided_diffusion/guided_diffusion/nn.py b/fengshen/examples/disco_project/guided_diffusion/guided_diffusion/nn.py new file mode 100644 index 0000000000000000000000000000000000000000..b28bd83cf23b4e19868afc2075b11ca1cfbd0e8d --- /dev/null +++ b/fengshen/examples/disco_project/guided_diffusion/guided_diffusion/nn.py @@ -0,0 +1,190 @@ +""" +Various utilities for neural networks. +""" + +import math + +import torch as th +import torch.nn as nn + + +# PyTorch 1.7 has SiLU, but we support PyTorch 1.5. +class SiLU(nn.Module): + def forward(self, x): + return x * th.sigmoid(x) + + +class GroupNorm32(nn.GroupNorm): + def forward(self, x): + return super().forward(x.float()).type(x.dtype) + + +def conv_nd(dims, *args, **kwargs): + """ + Create a 1D, 2D, or 3D convolution module. + """ + if dims == 1: + return nn.Conv1d(*args, **kwargs) + elif dims == 2: + return nn.Conv2d(*args, **kwargs) + elif dims == 3: + return nn.Conv3d(*args, **kwargs) + raise ValueError(f"unsupported dimensions: {dims}") + + +def linear(*args, **kwargs): + """ + Create a linear module. + """ + return nn.Linear(*args, **kwargs) + + +def avg_pool_nd(dims, *args, **kwargs): + """ + Create a 1D, 2D, or 3D average pooling module. + """ + if dims == 1: + return nn.AvgPool1d(*args, **kwargs) + elif dims == 2: + return nn.AvgPool2d(*args, **kwargs) + elif dims == 3: + return nn.AvgPool3d(*args, **kwargs) + raise ValueError(f"unsupported dimensions: {dims}") + + +def update_ema(target_params, source_params, rate=0.99): + """ + Update target parameters to be closer to those of source parameters using + an exponential moving average. + + :param target_params: the target parameter sequence. + :param source_params: the source parameter sequence. + :param rate: the EMA rate (closer to 1 means slower). + """ + for targ, src in zip(target_params, source_params): + targ.detach().mul_(rate).add_(src, alpha=1 - rate) + + +def zero_module(module): + """ + Zero out the parameters of a module and return it. + """ + for p in module.parameters(): + p.detach().zero_() + return module + + +def scale_module(module, scale): + """ + Scale the parameters of a module and return it. + """ + for p in module.parameters(): + p.detach().mul_(scale) + return module + + +def mean_flat(tensor): + """ + Take the mean over all non-batch dimensions. + """ + return tensor.mean(dim=list(range(1, len(tensor.shape)))) + + +def normalization(channels): + """ + Make a standard normalization layer. + + :param channels: number of input channels. + :return: an nn.Module for normalization. + """ + return GroupNorm32(32, channels) + + +def timestep_embedding(timesteps, dim, max_period=10000): + """ + Create sinusoidal timestep embeddings. + + :param timesteps: a 1-D Tensor of N indices, one per batch element. + These may be fractional. + :param dim: the dimension of the output. 
+ :param max_period: controls the minimum frequency of the embeddings. + :return: an [N x dim] Tensor of positional embeddings. + """ + half = dim // 2 + freqs = th.exp( + -math.log(max_period) * th.arange(start=0, end=half, dtype=th.float32) / half + ).to(device=timesteps.device) + args = timesteps[:, None].float() * freqs[None] + embedding = th.cat([th.cos(args), th.sin(args)], dim=-1) + if dim % 2: + embedding = th.cat([embedding, th.zeros_like(embedding[:, :1])], dim=-1) + return embedding + + +def checkpoint(func, inputs, params, flag): + """ + Evaluate a function without caching intermediate activations, allowing for + reduced memory at the expense of extra compute in the backward pass. + :param func: the function to evaluate. + :param inputs: the argument sequence to pass to `func`. + :param params: a sequence of parameters `func` depends on but does not + explicitly take as arguments. + :param flag: if False, disable gradient checkpointing. + """ + if flag: + args = tuple(inputs) + tuple(params) + return CheckpointFunction.apply(func, len(inputs), *args) + else: + return func(*inputs) + + +class CheckpointFunction(th.autograd.Function): + @staticmethod + @th.cuda.amp.custom_fwd + def forward(ctx, run_function, length, *args): + ctx.run_function = run_function + ctx.input_length = length + ctx.save_for_backward(*args) + with th.no_grad(): + output_tensors = ctx.run_function(*args[:length]) + return output_tensors + + @staticmethod + @th.cuda.amp.custom_bwd + def backward(ctx, *output_grads): + args = list(ctx.saved_tensors) + + # Filter for inputs that require grad. If none, exit early. + input_indices = [i for (i, x) in enumerate(args) if x.requires_grad] + if not input_indices: + return (None, None) + tuple(None for _ in args) + + with th.enable_grad(): + for i in input_indices: + if i < ctx.input_length: + # Not sure why the OAI code does this little + # dance. It might not be necessary. + args[i] = args[i].detach().requires_grad_() + args[i] = args[i].view_as(args[i]) + output_tensors = ctx.run_function(*args[:ctx.input_length]) + + if isinstance(output_tensors, th.Tensor): + output_tensors = [output_tensors] + + # Filter for outputs that require grad. If none, exit early. + out_and_grads = [(o, g) for (o, g) in zip(output_tensors, output_grads) if o.requires_grad] + if not out_and_grads: + return (None, None) + tuple(None for _ in args) + + # Compute gradients on the filtered tensors. + computed_grads = th.autograd.grad( + [o for (o, g) in out_and_grads], + [args[i] for i in input_indices], + [g for (o, g) in out_and_grads] + ) + + # Reassemble the complete gradient tuple. + input_grads = [None for _ in args] + for (i, g) in zip(input_indices, computed_grads): + input_grads[i] = g + return (None, None) + tuple(input_grads) diff --git a/fengshen/examples/disco_project/guided_diffusion/guided_diffusion/resample.py b/fengshen/examples/disco_project/guided_diffusion/guided_diffusion/resample.py new file mode 100644 index 0000000000000000000000000000000000000000..c82eccdcd47c468d41e7cbe02de6a731f2c9bf81 --- /dev/null +++ b/fengshen/examples/disco_project/guided_diffusion/guided_diffusion/resample.py @@ -0,0 +1,154 @@ +from abc import ABC, abstractmethod + +import numpy as np +import torch as th +import torch.distributed as dist + + +def create_named_schedule_sampler(name, diffusion): + """ + Create a ScheduleSampler from a library of pre-defined samplers. + + :param name: the name of the sampler. + :param diffusion: the diffusion object to sample for. 
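+
+    Illustrative usage (assumes `diffusion` is an already-constructed diffusion
+    object exposing `num_timesteps`, e.g. from create_gaussian_diffusion):
+
+        sampler = create_named_schedule_sampler("uniform", diffusion)
+        t, weights = sampler.sample(batch_size=4, device=th.device("cpu"))
+        # t: LongTensor of timestep indices; weights: per-sample loss rescaling factors.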
+ """ + if name == "uniform": + return UniformSampler(diffusion) + elif name == "loss-second-moment": + return LossSecondMomentResampler(diffusion) + else: + raise NotImplementedError(f"unknown schedule sampler: {name}") + + +class ScheduleSampler(ABC): + """ + A distribution over timesteps in the diffusion process, intended to reduce + variance of the objective. + + By default, samplers perform unbiased importance sampling, in which the + objective's mean is unchanged. + However, subclasses may override sample() to change how the resampled + terms are reweighted, allowing for actual changes in the objective. + """ + + @abstractmethod + def weights(self): + """ + Get a numpy array of weights, one per diffusion step. + + The weights needn't be normalized, but must be positive. + """ + + def sample(self, batch_size, device): + """ + Importance-sample timesteps for a batch. + + :param batch_size: the number of timesteps. + :param device: the torch device to save to. + :return: a tuple (timesteps, weights): + - timesteps: a tensor of timestep indices. + - weights: a tensor of weights to scale the resulting losses. + """ + w = self.weights() + p = w / np.sum(w) + indices_np = np.random.choice(len(p), size=(batch_size,), p=p) + indices = th.from_numpy(indices_np).long().to(device) + weights_np = 1 / (len(p) * p[indices_np]) + weights = th.from_numpy(weights_np).float().to(device) + return indices, weights + + +class UniformSampler(ScheduleSampler): + def __init__(self, diffusion): + self.diffusion = diffusion + self._weights = np.ones([diffusion.num_timesteps]) + + def weights(self): + return self._weights + + +class LossAwareSampler(ScheduleSampler): + def update_with_local_losses(self, local_ts, local_losses): + """ + Update the reweighting using losses from a model. + + Call this method from each rank with a batch of timesteps and the + corresponding losses for each of those timesteps. + This method will perform synchronization to make sure all of the ranks + maintain the exact same reweighting. + + :param local_ts: an integer Tensor of timesteps. + :param local_losses: a 1D Tensor of losses. + """ + batch_sizes = [ + th.tensor([0], dtype=th.int32, device=local_ts.device) + for _ in range(dist.get_world_size()) + ] + dist.all_gather( + batch_sizes, + th.tensor([len(local_ts)], dtype=th.int32, device=local_ts.device), + ) + + # Pad all_gather batches to be the maximum batch size. + batch_sizes = [x.item() for x in batch_sizes] + max_bs = max(batch_sizes) + + timestep_batches = [th.zeros(max_bs).to(local_ts) for bs in batch_sizes] + loss_batches = [th.zeros(max_bs).to(local_losses) for bs in batch_sizes] + dist.all_gather(timestep_batches, local_ts) + dist.all_gather(loss_batches, local_losses) + timesteps = [ + x.item() for y, bs in zip(timestep_batches, batch_sizes) for x in y[:bs] + ] + losses = [x.item() for y, bs in zip(loss_batches, batch_sizes) for x in y[:bs]] + self.update_with_all_losses(timesteps, losses) + + @abstractmethod + def update_with_all_losses(self, ts, losses): + """ + Update the reweighting using losses from a model. + + Sub-classes should override this method to update the reweighting + using losses from the model. + + This method directly updates the reweighting without synchronizing + between workers. It is called by update_with_local_losses from all + ranks with identical arguments. Thus, it should have deterministic + behavior to maintain state across workers. + + :param ts: a list of int timesteps. + :param losses: a list of float losses, one per timestep. 
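+
+        Illustrative call pattern (hypothetical values):
+
+            sampler.update_with_all_losses(ts=[3, 17, 42], losses=[0.9, 0.4, 0.2])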
+ """ + + +class LossSecondMomentResampler(LossAwareSampler): + def __init__(self, diffusion, history_per_term=10, uniform_prob=0.001): + self.diffusion = diffusion + self.history_per_term = history_per_term + self.uniform_prob = uniform_prob + self._loss_history = np.zeros( + [diffusion.num_timesteps, history_per_term], dtype=np.float64 + ) + self._loss_counts = np.zeros([diffusion.num_timesteps], dtype=np.int) + + def weights(self): + if not self._warmed_up(): + return np.ones([self.diffusion.num_timesteps], dtype=np.float64) + weights = np.sqrt(np.mean(self._loss_history ** 2, axis=-1)) + weights /= np.sum(weights) + weights *= 1 - self.uniform_prob + weights += self.uniform_prob / len(weights) + return weights + + def update_with_all_losses(self, ts, losses): + for t, loss in zip(ts, losses): + if self._loss_counts[t] == self.history_per_term: + # Shift out the oldest loss term. + self._loss_history[t, :-1] = self._loss_history[t, 1:] + self._loss_history[t, -1] = loss + else: + self._loss_history[t, self._loss_counts[t]] = loss + self._loss_counts[t] += 1 + + def _warmed_up(self): + return (self._loss_counts == self.history_per_term).all() diff --git a/fengshen/examples/disco_project/guided_diffusion/guided_diffusion/respace.py b/fengshen/examples/disco_project/guided_diffusion/guided_diffusion/respace.py new file mode 100644 index 0000000000000000000000000000000000000000..3e4ed31dba05d43cba1a262c0a166ab7df10fd9a --- /dev/null +++ b/fengshen/examples/disco_project/guided_diffusion/guided_diffusion/respace.py @@ -0,0 +1,128 @@ +import numpy as np +import torch as th + +from .gaussian_diffusion import GaussianDiffusion + + +def space_timesteps(num_timesteps, section_counts): + """ + Create a list of timesteps to use from an original diffusion process, + given the number of timesteps we want to take from equally-sized portions + of the original process. + + For example, if there's 300 timesteps and the section counts are [10,15,20] + then the first 100 timesteps are strided to be 10 timesteps, the second 100 + are strided to be 15 timesteps, and the final 100 are strided to be 20. + + If the stride is a string starting with "ddim", then the fixed striding + from the DDIM paper is used, and only one section is allowed. + + :param num_timesteps: the number of diffusion steps in the original + process to divide up. + :param section_counts: either a list of numbers, or a string containing + comma-separated numbers, indicating the step count + per section. As a special case, use "ddimN" where N + is a number of steps to use the striding from the + DDIM paper. + :return: a set of diffusion steps from the original process to use. 
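+
+    Illustrative examples:
+
+        space_timesteps(300, [10, 15, 20])  # 45 steps: 10 + 15 + 20 from three 100-step sections
+        space_timesteps(1000, "250")        # 250 roughly evenly spaced steps
+        space_timesteps(1000, "ddim50")     # 50 steps using the fixed DDIM stride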
+ """ + if isinstance(section_counts, str): + if section_counts.startswith("ddim"): + desired_count = int(section_counts[len("ddim"):]) + for i in range(1, num_timesteps): + if len(range(0, num_timesteps, i)) == desired_count: + return set(range(0, num_timesteps, i)) + raise ValueError( + f"cannot create exactly {num_timesteps} steps with an integer stride" + ) + section_counts = [int(x) for x in section_counts.split(",")] + size_per = num_timesteps // len(section_counts) + extra = num_timesteps % len(section_counts) + start_idx = 0 + all_steps = [] + for i, section_count in enumerate(section_counts): + size = size_per + (1 if i < extra else 0) + if size < section_count: + raise ValueError( + f"cannot divide section of {size} steps into {section_count}" + ) + if section_count <= 1: + frac_stride = 1 + else: + frac_stride = (size - 1) / (section_count - 1) + cur_idx = 0.0 + taken_steps = [] + for _ in range(section_count): + taken_steps.append(start_idx + round(cur_idx)) + cur_idx += frac_stride + all_steps += taken_steps + start_idx += size + return set(all_steps) + + +class SpacedDiffusion(GaussianDiffusion): + """ + A diffusion process which can skip steps in a base diffusion process. + + :param use_timesteps: a collection (sequence or set) of timesteps from the + original diffusion process to retain. + :param kwargs: the kwargs to create the base diffusion process. + """ + + def __init__(self, use_timesteps, **kwargs): + self.use_timesteps = set(use_timesteps) + self.timestep_map = [] + self.original_num_steps = len(kwargs["betas"]) + + base_diffusion = GaussianDiffusion(**kwargs) # pylint: disable=missing-kwoa + last_alpha_cumprod = 1.0 + new_betas = [] + for i, alpha_cumprod in enumerate(base_diffusion.alphas_cumprod): + if i in self.use_timesteps: + new_betas.append(1 - alpha_cumprod / last_alpha_cumprod) + last_alpha_cumprod = alpha_cumprod + self.timestep_map.append(i) + kwargs["betas"] = np.array(new_betas) + super().__init__(**kwargs) + + def p_mean_variance( + self, model, *args, **kwargs + ): # pylint: disable=signature-differs + return super().p_mean_variance(self._wrap_model(model), *args, **kwargs) + + def training_losses( + self, model, *args, **kwargs + ): # pylint: disable=signature-differs + return super().training_losses(self._wrap_model(model), *args, **kwargs) + + def condition_mean(self, cond_fn, *args, **kwargs): + return super().condition_mean(self._wrap_model(cond_fn), *args, **kwargs) + + def condition_score(self, cond_fn, *args, **kwargs): + return super().condition_score(self._wrap_model(cond_fn), *args, **kwargs) + + def _wrap_model(self, model): + if isinstance(model, _WrappedModel): + return model + return _WrappedModel( + model, self.timestep_map, self.rescale_timesteps, self.original_num_steps + ) + + def _scale_timesteps(self, t): + # Scaling is done by the wrapped model. 
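+        # (_WrappedModel below maps the spaced indices back onto the original
+        # schedule and, if rescale_timesteps is set, rescales them to [0, 1000).)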
+ return t + + +class _WrappedModel: + def __init__(self, model, timestep_map, rescale_timesteps, original_num_steps): + self.model = model + self.timestep_map = timestep_map + self.rescale_timesteps = rescale_timesteps + self.original_num_steps = original_num_steps + + def __call__(self, x, ts, **kwargs): + map_tensor = th.tensor(self.timestep_map, device=ts.device, dtype=ts.dtype) + new_ts = map_tensor[ts] + if self.rescale_timesteps: + new_ts = new_ts.float() * (1000.0 / self.original_num_steps) + return self.model(x, new_ts, **kwargs) diff --git a/fengshen/examples/disco_project/guided_diffusion/guided_diffusion/script_util.py b/fengshen/examples/disco_project/guided_diffusion/guided_diffusion/script_util.py new file mode 100644 index 0000000000000000000000000000000000000000..35af1fa83fc5588bd3a90e1200e13f70c342fcd7 --- /dev/null +++ b/fengshen/examples/disco_project/guided_diffusion/guided_diffusion/script_util.py @@ -0,0 +1,456 @@ +import argparse +import inspect + +from . import gaussian_diffusion as gd +from .respace import SpacedDiffusion, space_timesteps +from .unet import SuperResModel, EncoderUNetModel + +NUM_CLASSES = 1000 + + +def diffusion_defaults(): + """ + Defaults for image and classifier training. + """ + return dict( + learn_sigma=False, + diffusion_steps=1000, + noise_schedule="linear", + timestep_respacing="", + use_kl=False, + predict_xstart=False, + rescale_timesteps=False, + rescale_learned_sigmas=False, + ) + + +def classifier_defaults(): + """ + Defaults for classifier models. + """ + return dict( + image_size=64, + classifier_use_fp16=False, + classifier_width=128, + classifier_depth=2, + classifier_attention_resolutions="32,16,8", # 16 + classifier_use_scale_shift_norm=True, # False + classifier_resblock_updown=True, # False + classifier_pool="attention", + ) + + +def model_and_diffusion_defaults(): + """ + Defaults for image training. 
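+
+    Illustrative usage (mirrors the argparse helpers at the bottom of this file;
+    note that in this copy create_model() returns None, so only the diffusion
+    object is usable through this particular path):
+
+        defaults = model_and_diffusion_defaults()
+        parser = argparse.ArgumentParser()
+        add_dict_to_argparser(parser, defaults)
+        args = parser.parse_args([])
+        model, diffusion = create_model_and_diffusion(
+            **args_to_dict(args, defaults.keys())
+        )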
+ """ + res = dict( + image_size=64, + num_channels=128, + num_res_blocks=2, + num_heads=4, + num_heads_upsample=-1, + num_head_channels=-1, + attention_resolutions="16,8", + channel_mult="", + dropout=0.0, + class_cond=False, + use_checkpoint=False, + use_scale_shift_norm=True, + resblock_updown=False, + use_fp16=False, + use_new_attention_order=False, + ) + res.update(diffusion_defaults()) + return res + + +def classifier_and_diffusion_defaults(): + res = classifier_defaults() + res.update(diffusion_defaults()) + return res + + +def create_model_and_diffusion( + image_size, + class_cond, + learn_sigma, + num_channels, + num_res_blocks, + channel_mult, + num_heads, + num_head_channels, + num_heads_upsample, + attention_resolutions, + dropout, + diffusion_steps, + noise_schedule, + timestep_respacing, + use_kl, + predict_xstart, + rescale_timesteps, + rescale_learned_sigmas, + use_checkpoint, + use_scale_shift_norm, + resblock_updown, + use_fp16, + use_new_attention_order, +): + model = create_model( + image_size, + num_channels, + num_res_blocks, + channel_mult=channel_mult, + learn_sigma=learn_sigma, + class_cond=class_cond, + use_checkpoint=use_checkpoint, + attention_resolutions=attention_resolutions, + num_heads=num_heads, + num_head_channels=num_head_channels, + num_heads_upsample=num_heads_upsample, + use_scale_shift_norm=use_scale_shift_norm, + dropout=dropout, + resblock_updown=resblock_updown, + use_fp16=use_fp16, + use_new_attention_order=use_new_attention_order, + ) + diffusion = create_gaussian_diffusion( + steps=diffusion_steps, + learn_sigma=learn_sigma, + noise_schedule=noise_schedule, + use_kl=use_kl, + predict_xstart=predict_xstart, + rescale_timesteps=rescale_timesteps, + rescale_learned_sigmas=rescale_learned_sigmas, + timestep_respacing=timestep_respacing, + ) + return model, diffusion + + +def create_model( + image_size, + num_channels, + num_res_blocks, + channel_mult="", + learn_sigma=False, + class_cond=False, + use_checkpoint=False, + attention_resolutions="16", + num_heads=1, + num_head_channels=-1, + num_heads_upsample=-1, + use_scale_shift_norm=False, + dropout=0, + resblock_updown=False, + use_fp16=False, + use_new_attention_order=False, +): + if channel_mult == "": + if image_size == 512: + channel_mult = (0.5, 1, 1, 2, 2, 4, 4) + elif image_size == 256: + channel_mult = (1, 1, 2, 2, 4, 4) + elif image_size == 128: + channel_mult = (1, 1, 2, 3, 4) + elif image_size == 64: + channel_mult = (1, 2, 3, 4) + else: + raise ValueError(f"unsupported image size: {image_size}") + else: + channel_mult = tuple(int(ch_mult) for ch_mult in channel_mult.split(",")) + + attention_ds = [] + for res in attention_resolutions.split(","): + attention_ds.append(image_size // int(res)) + + # config = UNetConfig() + # return HFUNetModel(config=config) + return None + + # return UNetModel( + # image_size=image_size, + # in_channels=3, + # model_channels=num_channels, + # out_channels=(3 if not learn_sigma else 6), + # num_res_blocks=num_res_blocks, + # attention_resolutions=tuple(attention_ds), + # dropout=dropout, + # channel_mult=channel_mult, + # num_classes=(NUM_CLASSES if class_cond else None), + # use_checkpoint=use_checkpoint, + # use_fp16=use_fp16, + # num_heads=num_heads, + # num_head_channels=num_head_channels, + # num_heads_upsample=num_heads_upsample, + # use_scale_shift_norm=use_scale_shift_norm, + # resblock_updown=resblock_updown, + # use_new_attention_order=use_new_attention_order, + # ) + + +def create_classifier_and_diffusion( + image_size, + classifier_use_fp16, + 
classifier_width, + classifier_depth, + classifier_attention_resolutions, + classifier_use_scale_shift_norm, + classifier_resblock_updown, + classifier_pool, + learn_sigma, + diffusion_steps, + noise_schedule, + timestep_respacing, + use_kl, + predict_xstart, + rescale_timesteps, + rescale_learned_sigmas, +): + classifier = create_classifier( + image_size, + classifier_use_fp16, + classifier_width, + classifier_depth, + classifier_attention_resolutions, + classifier_use_scale_shift_norm, + classifier_resblock_updown, + classifier_pool, + ) + diffusion = create_gaussian_diffusion( + steps=diffusion_steps, + learn_sigma=learn_sigma, + noise_schedule=noise_schedule, + use_kl=use_kl, + predict_xstart=predict_xstart, + rescale_timesteps=rescale_timesteps, + rescale_learned_sigmas=rescale_learned_sigmas, + timestep_respacing=timestep_respacing, + ) + return classifier, diffusion + + +def create_classifier( + image_size, + classifier_use_fp16, + classifier_width, + classifier_depth, + classifier_attention_resolutions, + classifier_use_scale_shift_norm, + classifier_resblock_updown, + classifier_pool, +): + if image_size == 512: + channel_mult = (0.5, 1, 1, 2, 2, 4, 4) + elif image_size == 256: + channel_mult = (1, 1, 2, 2, 4, 4) + elif image_size == 128: + channel_mult = (1, 1, 2, 3, 4) + elif image_size == 64: + channel_mult = (1, 2, 3, 4) + else: + raise ValueError(f"unsupported image size: {image_size}") + + attention_ds = [] + for res in classifier_attention_resolutions.split(","): + attention_ds.append(image_size // int(res)) + + return EncoderUNetModel( + image_size=image_size, + in_channels=3, + model_channels=classifier_width, + out_channels=1000, + num_res_blocks=classifier_depth, + attention_resolutions=tuple(attention_ds), + channel_mult=channel_mult, + use_fp16=classifier_use_fp16, + num_head_channels=64, + use_scale_shift_norm=classifier_use_scale_shift_norm, + resblock_updown=classifier_resblock_updown, + pool=classifier_pool, + ) + + +def sr_model_and_diffusion_defaults(): + res = model_and_diffusion_defaults() + res["large_size"] = 256 + res["small_size"] = 64 + arg_names = inspect.getfullargspec(sr_create_model_and_diffusion)[0] + for k in res.copy().keys(): + if k not in arg_names: + del res[k] + return res + + +def sr_create_model_and_diffusion( + large_size, + small_size, + class_cond, + learn_sigma, + num_channels, + num_res_blocks, + num_heads, + num_head_channels, + num_heads_upsample, + attention_resolutions, + dropout, + diffusion_steps, + noise_schedule, + timestep_respacing, + use_kl, + predict_xstart, + rescale_timesteps, + rescale_learned_sigmas, + use_checkpoint, + use_scale_shift_norm, + resblock_updown, + use_fp16, +): + model = sr_create_model( + large_size, + small_size, + num_channels, + num_res_blocks, + learn_sigma=learn_sigma, + class_cond=class_cond, + use_checkpoint=use_checkpoint, + attention_resolutions=attention_resolutions, + num_heads=num_heads, + num_head_channels=num_head_channels, + num_heads_upsample=num_heads_upsample, + use_scale_shift_norm=use_scale_shift_norm, + dropout=dropout, + resblock_updown=resblock_updown, + use_fp16=use_fp16, + ) + diffusion = create_gaussian_diffusion( + steps=diffusion_steps, + learn_sigma=learn_sigma, + noise_schedule=noise_schedule, + use_kl=use_kl, + predict_xstart=predict_xstart, + rescale_timesteps=rescale_timesteps, + rescale_learned_sigmas=rescale_learned_sigmas, + timestep_respacing=timestep_respacing, + ) + return model, diffusion + + +def sr_create_model( + large_size, + small_size, + num_channels, + 
num_res_blocks, + learn_sigma, + class_cond, + use_checkpoint, + attention_resolutions, + num_heads, + num_head_channels, + num_heads_upsample, + use_scale_shift_norm, + dropout, + resblock_updown, + use_fp16, +): + _ = small_size # hack to prevent unused variable + + if large_size == 512: + channel_mult = (1, 1, 2, 2, 4, 4) + elif large_size == 256: + channel_mult = (1, 1, 2, 2, 4, 4) + elif large_size == 64: + channel_mult = (1, 2, 3, 4) + else: + raise ValueError(f"unsupported large size: {large_size}") + + attention_ds = [] + for res in attention_resolutions.split(","): + attention_ds.append(large_size // int(res)) + + return SuperResModel( + image_size=large_size, + in_channels=3, + model_channels=num_channels, + out_channels=(3 if not learn_sigma else 6), + num_res_blocks=num_res_blocks, + attention_resolutions=tuple(attention_ds), + dropout=dropout, + channel_mult=channel_mult, + num_classes=(NUM_CLASSES if class_cond else None), + use_checkpoint=use_checkpoint, + num_heads=num_heads, + num_head_channels=num_head_channels, + num_heads_upsample=num_heads_upsample, + use_scale_shift_norm=use_scale_shift_norm, + resblock_updown=resblock_updown, + use_fp16=use_fp16, + ) + + +def create_gaussian_diffusion( + *, + steps=1000, + learn_sigma=False, + sigma_small=False, + noise_schedule="linear", + use_kl=False, + predict_xstart=False, + rescale_timesteps=False, + rescale_learned_sigmas=False, + timestep_respacing="", +): + betas = gd.get_named_beta_schedule(noise_schedule, steps) + if use_kl: + loss_type = gd.LossType.RESCALED_KL + elif rescale_learned_sigmas: + loss_type = gd.LossType.RESCALED_MSE + else: + loss_type = gd.LossType.MSE + if not timestep_respacing: + timestep_respacing = [steps] + return SpacedDiffusion( + use_timesteps=space_timesteps(steps, timestep_respacing), + betas=betas, + model_mean_type=( + gd.ModelMeanType.EPSILON if not predict_xstart else gd.ModelMeanType.START_X + ), + model_var_type=( + ( + gd.ModelVarType.FIXED_LARGE + if not sigma_small + else gd.ModelVarType.FIXED_SMALL + ) + if not learn_sigma + else gd.ModelVarType.LEARNED_RANGE + ), + loss_type=loss_type, + rescale_timesteps=rescale_timesteps, + ) + + +def add_dict_to_argparser(parser, default_dict): + for k, v in default_dict.items(): + v_type = type(v) + if v is None: + v_type = str + elif isinstance(v, bool): + v_type = str2bool + parser.add_argument(f"--{k}", default=v, type=v_type) + + +def args_to_dict(args, keys): + return {k: getattr(args, k) for k in keys} + + +def str2bool(v): + """ + https://stackoverflow.com/questions/15008758/parsing-boolean-values-with-argparse + """ + if isinstance(v, bool): + return v + if v.lower() in ("yes", "true", "t", "y", "1"): + return True + elif v.lower() in ("no", "false", "f", "n", "0"): + return False + else: + raise argparse.ArgumentTypeError("boolean value expected") diff --git a/fengshen/examples/disco_project/guided_diffusion/guided_diffusion/unet.py b/fengshen/examples/disco_project/guided_diffusion/guided_diffusion/unet.py new file mode 100644 index 0000000000000000000000000000000000000000..187b6c9737fda143f70e8dae365c35b690820466 --- /dev/null +++ b/fengshen/examples/disco_project/guided_diffusion/guided_diffusion/unet.py @@ -0,0 +1,975 @@ +from abc import abstractmethod + +import math + +import numpy as np +import torch as th +import torch.nn as nn +import torch.nn.functional as F + +from .fp16_util import convert_module_to_f16, convert_module_to_f32 +from .nn import ( + checkpoint, + conv_nd, + linear, + avg_pool_nd, + zero_module, + normalization, + 
timestep_embedding, +) + +from transformers import PreTrainedModel, PretrainedConfig + + +class AttentionPool2d(nn.Module): + """ + Adapted from CLIP: https://github.com/openai/CLIP/blob/main/clip/model.py + """ + + def __init__( + self, + spacial_dim: int, + embed_dim: int, + num_heads_channels: int, + output_dim: int = None, + ): + super().__init__() + self.positional_embedding = nn.Parameter( + th.randn(embed_dim, spacial_dim ** 2 + 1) / embed_dim ** 0.5 + ) + self.qkv_proj = conv_nd(1, embed_dim, 3 * embed_dim, 1) + self.c_proj = conv_nd(1, embed_dim, output_dim or embed_dim, 1) + self.num_heads = embed_dim // num_heads_channels + self.attention = QKVAttention(self.num_heads) + + def forward(self, x): + b, c, *_spatial = x.shape + x = x.reshape(b, c, -1) # NC(HW) + x = th.cat([x.mean(dim=-1, keepdim=True), x], dim=-1) # NC(HW+1) + x = x + self.positional_embedding[None, :, :].to(x.dtype) # NC(HW+1) + x = self.qkv_proj(x) + x = self.attention(x) + x = self.c_proj(x) + return x[:, :, 0] + + +class TimestepBlock(nn.Module): + """ + Any module where forward() takes timestep embeddings as a second argument. + """ + + @abstractmethod + def forward(self, x, emb): + """ + Apply the module to `x` given `emb` timestep embeddings. + """ + + +class TimestepEmbedSequential(nn.Sequential, TimestepBlock): + """ + A sequential module that passes timestep embeddings to the children that + support it as an extra input. + """ + + def forward(self, x, emb): + for layer in self: + if isinstance(layer, TimestepBlock): + x = layer(x, emb) + else: + x = layer(x) + return x + + +class Upsample(nn.Module): + """ + An upsampling layer with an optional convolution. + + :param channels: channels in the inputs and outputs. + :param use_conv: a bool determining if a convolution is applied. + :param dims: determines if the signal is 1D, 2D, or 3D. If 3D, then + upsampling occurs in the inner-two dimensions. + """ + + def __init__(self, channels, use_conv, dims=2, out_channels=None): + super().__init__() + self.channels = channels + self.out_channels = out_channels or channels + self.use_conv = use_conv + self.dims = dims + if use_conv: + self.conv = conv_nd(dims, self.channels, self.out_channels, 3, padding=1) + + def forward(self, x): + assert x.shape[1] == self.channels + if self.dims == 3: + x = F.interpolate( + x, (x.shape[2], x.shape[3] * 2, x.shape[4] * 2), mode="nearest" + ) + else: + x = F.interpolate(x, scale_factor=2, mode="nearest") + if self.use_conv: + x = self.conv(x) + return x + + +class Downsample(nn.Module): + """ + A downsampling layer with an optional convolution. + + :param channels: channels in the inputs and outputs. + :param use_conv: a bool determining if a convolution is applied. + :param dims: determines if the signal is 1D, 2D, or 3D. If 3D, then + downsampling occurs in the inner-two dimensions. + """ + + def __init__(self, channels, use_conv, dims=2, out_channels=None): + super().__init__() + self.channels = channels + self.out_channels = out_channels or channels + self.use_conv = use_conv + self.dims = dims + stride = 2 if dims != 3 else (1, 2, 2) + if use_conv: + self.op = conv_nd( + dims, self.channels, self.out_channels, 3, stride=stride, padding=1 + ) + else: + assert self.channels == self.out_channels + self.op = avg_pool_nd(dims, kernel_size=stride, stride=stride) + + def forward(self, x): + assert x.shape[1] == self.channels + return self.op(x) + + +class ResBlock(TimestepBlock): + """ + A residual block that can optionally change the number of channels. 
+ + :param channels: the number of input channels. + :param emb_channels: the number of timestep embedding channels. + :param dropout: the rate of dropout. + :param out_channels: if specified, the number of out channels. + :param use_conv: if True and out_channels is specified, use a spatial + convolution instead of a smaller 1x1 convolution to change the + channels in the skip connection. + :param dims: determines if the signal is 1D, 2D, or 3D. + :param use_checkpoint: if True, use gradient checkpointing on this module. + :param up: if True, use this block for upsampling. + :param down: if True, use this block for downsampling. + """ + + def __init__( + self, + channels, + emb_channels, + dropout, + out_channels=None, + use_conv=False, + use_scale_shift_norm=False, + dims=2, + use_checkpoint=False, + up=False, + down=False, + ): + super().__init__() + self.channels = channels + self.emb_channels = emb_channels + self.dropout = dropout + self.out_channels = out_channels or channels + self.use_conv = use_conv + self.use_checkpoint = use_checkpoint + self.use_scale_shift_norm = use_scale_shift_norm + + self.in_layers = nn.Sequential( + normalization(channels), + nn.SiLU(), + conv_nd(dims, channels, self.out_channels, 3, padding=1), + ) + + self.updown = up or down + + if up: + self.h_upd = Upsample(channels, False, dims) + self.x_upd = Upsample(channels, False, dims) + elif down: + self.h_upd = Downsample(channels, False, dims) + self.x_upd = Downsample(channels, False, dims) + else: + self.h_upd = self.x_upd = nn.Identity() + + self.emb_layers = nn.Sequential( + nn.SiLU(), + linear( + emb_channels, + 2 * self.out_channels if use_scale_shift_norm else self.out_channels, + ), + ) + self.out_layers = nn.Sequential( + normalization(self.out_channels), + nn.SiLU(), + nn.Dropout(p=dropout), + zero_module( + conv_nd(dims, self.out_channels, self.out_channels, 3, padding=1) + ), + ) + + if self.out_channels == channels: + self.skip_connection = nn.Identity() + elif use_conv: + self.skip_connection = conv_nd( + dims, channels, self.out_channels, 3, padding=1 + ) + else: + self.skip_connection = conv_nd(dims, channels, self.out_channels, 1) + + def forward(self, x, emb): + """ + Apply the block to a Tensor, conditioned on a timestep embedding. + + :param x: an [N x C x ...] Tensor of features. + :param emb: an [N x emb_channels] Tensor of timestep embeddings. + :return: an [N x C x ...] Tensor of outputs. + """ + return checkpoint( + self._forward, (x, emb), self.parameters(), self.use_checkpoint + ) + + def _forward(self, x, emb): + if self.updown: + in_rest, in_conv = self.in_layers[:-1], self.in_layers[-1] + h = in_rest(x) + h = self.h_upd(h) + x = self.x_upd(x) + h = in_conv(h) + else: + h = self.in_layers(x) + emb_out = self.emb_layers(emb).type(h.dtype) + while len(emb_out.shape) < len(h.shape): + emb_out = emb_out[..., None] + if self.use_scale_shift_norm: + out_norm, out_rest = self.out_layers[0], self.out_layers[1:] + scale, shift = th.chunk(emb_out, 2, dim=1) + h = out_norm(h) * (1 + scale) + shift + h = out_rest(h) + else: + h = h + emb_out + h = self.out_layers(h) + return self.skip_connection(x) + h + + +class AttentionBlock(nn.Module): + """ + An attention block that allows spatial positions to attend to each other. + + Originally ported from here, but adapted to the N-d case. + https://github.com/hojonathanho/diffusion/blob/1e0dceb3b3495bbe19116a5e1b3596cd0706c543/diffusion_tf/models/unet.py#L66. 
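+
+    Illustrative example (shape-preserving; arbitrary sizes):
+
+        blk = AttentionBlock(channels=64, num_heads=4)
+        y = blk(th.randn(2, 64, 16, 16))  # y.shape == (2, 64, 16, 16)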
+ """ + + def __init__( + self, + channels, + num_heads=1, + num_head_channels=-1, + use_checkpoint=False, + use_new_attention_order=False, + ): + super().__init__() + self.channels = channels + if num_head_channels == -1: + self.num_heads = num_heads + else: + assert ( + channels % num_head_channels == 0 + ), f"q,k,v channels {channels} is not divisible by num_head_channels {num_head_channels}" + self.num_heads = channels // num_head_channels + self.use_checkpoint = use_checkpoint + self.norm = normalization(channels) + self.qkv = conv_nd(1, channels, channels * 3, 1) + if use_new_attention_order: + # split qkv before split heads + self.attention = QKVAttention(self.num_heads) + else: + # split heads before split qkv + self.attention = QKVAttentionLegacy(self.num_heads) + + self.proj_out = zero_module(conv_nd(1, channels, channels, 1)) + + def forward(self, x): + return checkpoint(self._forward, (x,), self.parameters(), self.use_checkpoint) + + def _forward(self, x): + b, c, *spatial = x.shape + x = x.reshape(b, c, -1) + qkv = self.qkv(self.norm(x)) + h = self.attention(qkv) + h = self.proj_out(h) + return (x + h).reshape(b, c, *spatial) + + +def count_flops_attn(model, _x, y): + """ + A counter for the `thop` package to count the operations in an + attention operation. + Meant to be used like: + macs, params = thop.profile( + model, + inputs=(inputs, timestamps), + custom_ops={QKVAttention: QKVAttention.count_flops}, + ) + """ + b, c, *spatial = y[0].shape + num_spatial = int(np.prod(spatial)) + # We perform two matmuls with the same number of ops. + # The first computes the weight matrix, the second computes + # the combination of the value vectors. + matmul_ops = 2 * b * (num_spatial ** 2) * c + model.total_ops += th.DoubleTensor([matmul_ops]) + + +class QKVAttentionLegacy(nn.Module): + """ + A module which performs QKV attention. Matches legacy QKVAttention + input/ouput heads shaping + """ + + def __init__(self, n_heads): + super().__init__() + self.n_heads = n_heads + + def forward(self, qkv): + """ + Apply QKV attention. + + :param qkv: an [N x (H * 3 * C) x T] tensor of Qs, Ks, and Vs. + :return: an [N x (H * C) x T] tensor after attention. + """ + bs, width, length = qkv.shape + assert width % (3 * self.n_heads) == 0 + ch = width // (3 * self.n_heads) + q, k, v = qkv.reshape(bs * self.n_heads, ch * 3, length).split(ch, dim=1) + scale = 1 / math.sqrt(math.sqrt(ch)) + weight = th.einsum( + "bct,bcs->bts", q * scale, k * scale + ) # More stable with f16 than dividing afterwards + weight = th.softmax(weight.float(), dim=-1).type(weight.dtype) + a = th.einsum("bts,bcs->bct", weight, v) + return a.reshape(bs, -1, length) + + @staticmethod + def count_flops(model, _x, y): + return count_flops_attn(model, _x, y) + + +class QKVAttention(nn.Module): + """ + A module which performs QKV attention and splits in a different order. + """ + + def __init__(self, n_heads): + super().__init__() + self.n_heads = n_heads + + def forward(self, qkv): + """ + Apply QKV attention. + + :param qkv: an [N x (3 * H * C) x T] tensor of Qs, Ks, and Vs. + :return: an [N x (H * C) x T] tensor after attention. 
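+
+        Illustrative example (N=2, H=4 heads, C=16 channels per head, T=100):
+
+            attn = QKVAttention(n_heads=4)
+            out = attn(th.randn(2, 3 * 4 * 16, 100))  # out.shape == (2, 4 * 16, 100)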
+ """ + bs, width, length = qkv.shape + assert width % (3 * self.n_heads) == 0 + ch = width // (3 * self.n_heads) + q, k, v = qkv.chunk(3, dim=1) + scale = 1 / math.sqrt(math.sqrt(ch)) + weight = th.einsum( + "bct,bcs->bts", + (q * scale).view(bs * self.n_heads, ch, length), + (k * scale).view(bs * self.n_heads, ch, length), + ) # More stable with f16 than dividing afterwards + weight = th.softmax(weight.float(), dim=-1).type(weight.dtype) + a = th.einsum("bts,bcs->bct", weight, v.reshape(bs * self.n_heads, ch, length)) + return a.reshape(bs, -1, length) + + @staticmethod + def count_flops(model, _x, y): + return count_flops_attn(model, _x, y) + + +class UNetModel(nn.Module): + """ + The full UNet model with attention and timestep embedding. + + :param in_channels: channels in the input Tensor. + :param model_channels: base channel count for the model. + :param out_channels: channels in the output Tensor. + :param num_res_blocks: number of residual blocks per downsample. + :param attention_resolutions: a collection of downsample rates at which + attention will take place. May be a set, list, or tuple. + For example, if this contains 4, then at 4x downsampling, attention + will be used. + :param dropout: the dropout probability. + :param channel_mult: channel multiplier for each level of the UNet. + :param conv_resample: if True, use learned convolutions for upsampling and + downsampling. + :param dims: determines if the signal is 1D, 2D, or 3D. + :param num_classes: if specified (as an int), then this model will be + class-conditional with `num_classes` classes. + :param use_checkpoint: use gradient checkpointing to reduce memory usage. + :param num_heads: the number of attention heads in each attention layer. + :param num_heads_channels: if specified, ignore num_heads and instead use + a fixed channel width per attention head. + :param num_heads_upsample: works with num_heads to set a different number + of heads for upsampling. Deprecated. + :param use_scale_shift_norm: use a FiLM-like conditioning mechanism. + :param resblock_updown: use residual blocks for up/downsampling. + :param use_new_attention_order: use a different attention pattern for potentially + increased efficiency. 
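+
+    Illustrative example (a deliberately tiny configuration; the 512x512 setup used
+    elsewhere is wrapped by UNetConfig/HFUNetModel at the bottom of this module):
+
+        net = UNetModel(
+            image_size=64,
+            in_channels=3,
+            model_channels=32,
+            out_channels=3,
+            num_res_blocks=1,
+            attention_resolutions=(4,),
+            channel_mult=(1, 2, 4),
+            num_heads=2,
+        )
+        eps = net(th.randn(2, 3, 64, 64), th.tensor([10, 20]))  # eps.shape == (2, 3, 64, 64)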
+ """ + + def __init__( + self, + image_size, + in_channels, + model_channels, + out_channels, + num_res_blocks, + attention_resolutions, + dropout=0, + channel_mult=(1, 2, 4, 8), + conv_resample=True, + dims=2, + num_classes=None, + use_checkpoint=False, + use_fp16=False, + num_heads=1, + num_head_channels=-1, + num_heads_upsample=-1, + use_scale_shift_norm=False, + resblock_updown=False, + use_new_attention_order=False, + ): + super().__init__() + + if num_heads_upsample == -1: + num_heads_upsample = num_heads + + self.image_size = image_size + self.in_channels = in_channels + self.model_channels = model_channels + self.out_channels = out_channels + self.num_res_blocks = num_res_blocks + self.attention_resolutions = attention_resolutions + self.dropout = dropout + self.channel_mult = channel_mult + self.conv_resample = conv_resample + self.num_classes = num_classes + self.use_checkpoint = use_checkpoint + self.dtype = th.float16 if use_fp16 else th.float32 + self.num_heads = num_heads + self.num_head_channels = num_head_channels + self.num_heads_upsample = num_heads_upsample + + time_embed_dim = model_channels * 4 + self.time_embed = nn.Sequential( + linear(model_channels, time_embed_dim), + nn.SiLU(), + linear(time_embed_dim, time_embed_dim), + ) + + if self.num_classes is not None: + self.label_emb = nn.Embedding(num_classes, time_embed_dim) + + ch = input_ch = int(channel_mult[0] * model_channels) + self.input_blocks = nn.ModuleList( + [TimestepEmbedSequential(conv_nd(dims, in_channels, ch, 3, padding=1))] + ) + self._feature_size = ch + input_block_chans = [ch] + ds = 1 + for level, mult in enumerate(channel_mult): + for _ in range(num_res_blocks): + layers = [ + ResBlock( + ch, + time_embed_dim, + dropout, + out_channels=int(mult * model_channels), + dims=dims, + use_checkpoint=use_checkpoint, + use_scale_shift_norm=use_scale_shift_norm, + ) + ] + ch = int(mult * model_channels) + if ds in attention_resolutions: + layers.append( + AttentionBlock( + ch, + use_checkpoint=use_checkpoint, + num_heads=num_heads, + num_head_channels=num_head_channels, + use_new_attention_order=use_new_attention_order, + ) + ) + self.input_blocks.append(TimestepEmbedSequential(*layers)) + self._feature_size += ch + input_block_chans.append(ch) + if level != len(channel_mult) - 1: + out_ch = ch + self.input_blocks.append( + TimestepEmbedSequential( + ResBlock( + ch, + time_embed_dim, + dropout, + out_channels=out_ch, + dims=dims, + use_checkpoint=use_checkpoint, + use_scale_shift_norm=use_scale_shift_norm, + down=True, + ) + if resblock_updown + else Downsample( + ch, conv_resample, dims=dims, out_channels=out_ch + ) + ) + ) + ch = out_ch + input_block_chans.append(ch) + ds *= 2 + self._feature_size += ch + + self.middle_block = TimestepEmbedSequential( + ResBlock( + ch, + time_embed_dim, + dropout, + dims=dims, + use_checkpoint=use_checkpoint, + use_scale_shift_norm=use_scale_shift_norm, + ), + AttentionBlock( + ch, + use_checkpoint=use_checkpoint, + num_heads=num_heads, + num_head_channels=num_head_channels, + use_new_attention_order=use_new_attention_order, + ), + ResBlock( + ch, + time_embed_dim, + dropout, + dims=dims, + use_checkpoint=use_checkpoint, + use_scale_shift_norm=use_scale_shift_norm, + ), + ) + self._feature_size += ch + + self.output_blocks = nn.ModuleList([]) + for level, mult in list(enumerate(channel_mult))[::-1]: + for i in range(num_res_blocks + 1): + ich = input_block_chans.pop() + layers = [ + ResBlock( + ch + ich, + time_embed_dim, + dropout, + out_channels=int(model_channels * 
mult), + dims=dims, + use_checkpoint=use_checkpoint, + use_scale_shift_norm=use_scale_shift_norm, + ) + ] + ch = int(model_channels * mult) + if ds in attention_resolutions: + layers.append( + AttentionBlock( + ch, + use_checkpoint=use_checkpoint, + num_heads=num_heads_upsample, + num_head_channels=num_head_channels, + use_new_attention_order=use_new_attention_order, + ) + ) + if level and i == num_res_blocks: + out_ch = ch + layers.append( + ResBlock( + ch, + time_embed_dim, + dropout, + out_channels=out_ch, + dims=dims, + use_checkpoint=use_checkpoint, + use_scale_shift_norm=use_scale_shift_norm, + up=True, + ) + if resblock_updown + else Upsample(ch, conv_resample, dims=dims, out_channels=out_ch) + ) + ds //= 2 + self.output_blocks.append(TimestepEmbedSequential(*layers)) + self._feature_size += ch + + self.out = nn.Sequential( + normalization(ch), + nn.SiLU(), + zero_module(conv_nd(dims, input_ch, out_channels, 3, padding=1)), + ) + + def convert_to_fp16(self): + """ + Convert the torso of the model to float16. + """ + self.input_blocks.apply(convert_module_to_f16) + self.middle_block.apply(convert_module_to_f16) + self.output_blocks.apply(convert_module_to_f16) + + def convert_to_fp32(self): + """ + Convert the torso of the model to float32. + """ + self.input_blocks.apply(convert_module_to_f32) + self.middle_block.apply(convert_module_to_f32) + self.output_blocks.apply(convert_module_to_f32) + + def forward(self, x, timesteps, y=None): + """ + Apply the model to an input batch. + + :param x: an [N x C x ...] Tensor of inputs. + :param timesteps: a 1-D batch of timesteps. + :param y: an [N] Tensor of labels, if class-conditional. + :return: an [N x C x ...] Tensor of outputs. + """ + assert (y is not None) == ( + self.num_classes is not None + ), "must specify y if and only if the model is class-conditional" + + hs = [] + emb = self.time_embed(timestep_embedding(timesteps, self.model_channels)) + + if self.num_classes is not None: + assert y.shape == (x.shape[0],) + emb = emb + self.label_emb(y) + + h = x.type(self.dtype) + for module in self.input_blocks: + h = module(h, emb) + hs.append(h) + h = self.middle_block(h, emb) + for module in self.output_blocks: + h = th.cat([h, hs.pop()], dim=1) + h = module(h, emb) + h = h.type(x.dtype) + return self.out(h) + + +class SuperResModel(UNetModel): + """ + A UNetModel that performs super-resolution. + + Expects an extra kwarg `low_res` to condition on a low-resolution image. + """ + + def __init__(self, image_size, in_channels, *args, **kwargs): + super().__init__(image_size, in_channels * 2, *args, **kwargs) + + def forward(self, x, timesteps, low_res=None, **kwargs): + _, _, new_height, new_width = x.shape + upsampled = F.interpolate(low_res, (new_height, new_width), mode="bilinear") + x = th.cat([x, upsampled], dim=1) + return super().forward(x, timesteps, **kwargs) + + +class EncoderUNetModel(nn.Module): + """ + The half UNet model with attention and timestep embedding. + + For usage, see UNet. 
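+
+    Illustrative example (a small classifier-style encoder with attention pooling;
+    arbitrary sizes, not one of the released configurations):
+
+        enc = EncoderUNetModel(
+            image_size=64,
+            in_channels=3,
+            model_channels=32,
+            out_channels=10,
+            num_res_blocks=1,
+            attention_resolutions=(4,),
+            channel_mult=(1, 2, 4),
+            num_head_channels=16,
+            pool="attention",
+        )
+        logits = enc(th.randn(2, 3, 64, 64), th.tensor([10, 20]))  # logits.shape == (2, 10)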
+ """ + + def __init__( + self, + image_size, + in_channels, + model_channels, + out_channels, + num_res_blocks, + attention_resolutions, + dropout=0, + channel_mult=(1, 2, 4, 8), + conv_resample=True, + dims=2, + use_checkpoint=False, + use_fp16=False, + num_heads=1, + num_head_channels=-1, + num_heads_upsample=-1, + use_scale_shift_norm=False, + resblock_updown=False, + use_new_attention_order=False, + pool="adaptive", + ): + super().__init__() + + if num_heads_upsample == -1: + num_heads_upsample = num_heads + + self.in_channels = in_channels + self.model_channels = model_channels + self.out_channels = out_channels + self.num_res_blocks = num_res_blocks + self.attention_resolutions = attention_resolutions + self.dropout = dropout + self.channel_mult = channel_mult + self.conv_resample = conv_resample + self.use_checkpoint = use_checkpoint + self.dtype = th.float16 if use_fp16 else th.float32 + self.num_heads = num_heads + self.num_head_channels = num_head_channels + self.num_heads_upsample = num_heads_upsample + + time_embed_dim = model_channels * 4 + self.time_embed = nn.Sequential( + linear(model_channels, time_embed_dim), + nn.SiLU(), + linear(time_embed_dim, time_embed_dim), + ) + + ch = int(channel_mult[0] * model_channels) + self.input_blocks = nn.ModuleList( + [TimestepEmbedSequential(conv_nd(dims, in_channels, ch, 3, padding=1))] + ) + self._feature_size = ch + input_block_chans = [ch] + ds = 1 + for level, mult in enumerate(channel_mult): + for _ in range(num_res_blocks): + layers = [ + ResBlock( + ch, + time_embed_dim, + dropout, + out_channels=int(mult * model_channels), + dims=dims, + use_checkpoint=use_checkpoint, + use_scale_shift_norm=use_scale_shift_norm, + ) + ] + ch = int(mult * model_channels) + if ds in attention_resolutions: + layers.append( + AttentionBlock( + ch, + use_checkpoint=use_checkpoint, + num_heads=num_heads, + num_head_channels=num_head_channels, + use_new_attention_order=use_new_attention_order, + ) + ) + self.input_blocks.append(TimestepEmbedSequential(*layers)) + self._feature_size += ch + input_block_chans.append(ch) + if level != len(channel_mult) - 1: + out_ch = ch + self.input_blocks.append( + TimestepEmbedSequential( + ResBlock( + ch, + time_embed_dim, + dropout, + out_channels=out_ch, + dims=dims, + use_checkpoint=use_checkpoint, + use_scale_shift_norm=use_scale_shift_norm, + down=True, + ) + if resblock_updown + else Downsample( + ch, conv_resample, dims=dims, out_channels=out_ch + ) + ) + ) + ch = out_ch + input_block_chans.append(ch) + ds *= 2 + self._feature_size += ch + + self.middle_block = TimestepEmbedSequential( + ResBlock( + ch, + time_embed_dim, + dropout, + dims=dims, + use_checkpoint=use_checkpoint, + use_scale_shift_norm=use_scale_shift_norm, + ), + AttentionBlock( + ch, + use_checkpoint=use_checkpoint, + num_heads=num_heads, + num_head_channels=num_head_channels, + use_new_attention_order=use_new_attention_order, + ), + ResBlock( + ch, + time_embed_dim, + dropout, + dims=dims, + use_checkpoint=use_checkpoint, + use_scale_shift_norm=use_scale_shift_norm, + ), + ) + self._feature_size += ch + self.pool = pool + if pool == "adaptive": + self.out = nn.Sequential( + normalization(ch), + nn.SiLU(), + nn.AdaptiveAvgPool2d((1, 1)), + zero_module(conv_nd(dims, ch, out_channels, 1)), + nn.Flatten(), + ) + elif pool == "attention": + assert num_head_channels != -1 + self.out = nn.Sequential( + normalization(ch), + nn.SiLU(), + AttentionPool2d( + (image_size // ds), ch, num_head_channels, out_channels + ), + ) + elif pool == "spatial": + 
self.out = nn.Sequential( + nn.Linear(self._feature_size, 2048), + nn.ReLU(), + nn.Linear(2048, self.out_channels), + ) + elif pool == "spatial_v2": + self.out = nn.Sequential( + nn.Linear(self._feature_size, 2048), + normalization(2048), + nn.SiLU(), + nn.Linear(2048, self.out_channels), + ) + else: + raise NotImplementedError(f"Unexpected {pool} pooling") + + def convert_to_fp16(self): + """ + Convert the torso of the model to float16. + """ + self.input_blocks.apply(convert_module_to_f16) + self.middle_block.apply(convert_module_to_f16) + + def convert_to_fp32(self): + """ + Convert the torso of the model to float32. + """ + self.input_blocks.apply(convert_module_to_f32) + self.middle_block.apply(convert_module_to_f32) + + def forward(self, x, timesteps): + """ + Apply the model to an input batch. + + :param x: an [N x C x ...] Tensor of inputs. + :param timesteps: a 1-D batch of timesteps. + :return: an [N x K] Tensor of outputs. + """ + emb = self.time_embed(timestep_embedding(timesteps, self.model_channels)) + + results = [] + h = x.type(self.dtype) + for module in self.input_blocks: + h = module(h, emb) + if self.pool.startswith("spatial"): + results.append(h.type(x.dtype).mean(dim=(2, 3))) + h = self.middle_block(h, emb) + if self.pool.startswith("spatial"): + results.append(h.type(x.dtype).mean(dim=(2, 3))) + h = th.cat(results, axis=-1) + return self.out(h) + else: + h = h.type(x.dtype) + return self.out(h) + + +class UNetConfig(PretrainedConfig): + def __init__( + self, + image_size=512, + in_channels=3, + model_channels=256, + out_channels=6, + num_res_blocks=2, + attention_resolutions=[16, 32, 64], + dropout=0.0, + channel_mult=(0.5, 1, 1, 2, 2, 4, 4), + num_classes=None, + use_checkpoint=False, + use_fp16=True, + num_heads=4, + num_head_channels=64, + num_heads_upsample=-1, + use_scale_shift_norm=True, + resblock_updown=True, + use_new_attention_order=False, + **kwargs + ): + self.image_size = image_size + self.in_channels = in_channels + self.model_channels = model_channels + self.out_channels = out_channels + self.num_res_blocks = num_res_blocks + self.attention_resolutions = attention_resolutions + self.dropout = dropout + self.channel_mult = channel_mult + self.num_classes = num_classes + self.use_checkpoint = use_checkpoint + self.use_fp16 = use_fp16 + self.num_heads = num_heads + self.num_head_channels = num_head_channels + self.num_heads_upsample = num_heads_upsample + self.use_scale_shift_norm = use_scale_shift_norm + self.resblock_updown = resblock_updown + self.use_new_attention_order = use_new_attention_order + super().__init__(**kwargs) + + +class HFUNetModel(PreTrainedModel): + config_class = UNetConfig + + def __init__(self, config): + super().__init__(config) + self.model = UNetModel( + image_size=config.image_size, + in_channels=config.in_channels, + model_channels=config.model_channels, + out_channels=config.out_channels, + num_res_blocks=config.num_res_blocks, + attention_resolutions=config.attention_resolutions, + dropout=config.dropout, + channel_mult=config.channel_mult, + num_classes=config.num_classes, + use_checkpoint=config.use_checkpoint, + use_fp16=config.use_fp16, + num_heads=config.num_heads, + num_head_channels=config.num_head_channels, + num_heads_upsample=config.num_heads_upsample, + use_scale_shift_norm=config.use_scale_shift_norm, + resblock_updown=config.resblock_updown, + use_new_attention_order=config.use_new_attention_order, + ) + + def forward(self, x, timesteps, y=None): + return self.model.forward(x, timesteps, y) + + def 
convert_to_fp16(self): + """ + Convert the torso of the model to float16. + """ + self.model.input_blocks.apply(convert_module_to_f16) + self.model.middle_block.apply(convert_module_to_f16) + self.model.output_blocks.apply(convert_module_to_f16) diff --git a/fengshen/examples/disco_project/st_disco.py b/fengshen/examples/disco_project/st_disco.py new file mode 100644 index 0000000000000000000000000000000000000000..8676ac2763aab65300bdcb588ac74c4e672745d5 --- /dev/null +++ b/fengshen/examples/disco_project/st_disco.py @@ -0,0 +1,56 @@ +# from disco_huge import Diffuser +# from utils import * +from disco import Diffuser +import streamlit as st +from io import BytesIO +from PIL import Image +from disco import steps + + +@st.cache(show_spinner=False, allow_output_mutation=True) # 加装饰器, 只加载一次。 +class ST_Diffuser(Diffuser): + def __init__(self, custom_path): + super().__init__(custom_path) + + +if __name__ == '__main__': + dd = ST_Diffuser(custom_path="IDEA-CCNL/Taiyi-Diffusion-532M-Nature") # 初始化 + form = st.form("参数设置") + input_text = form.text_input('输入文本生成图像:', value='', placeholder='你想象的一个画面') + form.form_submit_button("提交") + uploaded_file = st.file_uploader("上传初始化图片(可选)", type=["jpg", "png", "jpeg"]) + + text_scale_norm = st.sidebar.slider('文本强度', 0.1, 1.0, 0.5, step=0.1) + text_scale = int(text_scale_norm * 10000) + res_skip_steps = st.sidebar.slider('加噪强度', 0.1, 1.0, 0.9, step=0.1) + skip_steps = int(steps - round(res_skip_steps * steps)) + width = st.sidebar.slider('宽度', 384, 1024, 512, step=64) + heigth = st.sidebar.slider('高度', 384, 1024, 512, step=64) + + with st.spinner('正在生成中...'): + capture_img = None + if uploaded_file is not None: + # To read file as bytes: + bytes_data = uploaded_file.getvalue() + # 将字节数据转化成字节流 + bytes_data = BytesIO(bytes_data) + # Image.open()可以读字节流 + capture_img = Image.open(bytes_data).convert('RGB').resize((width, heigth)) + + image_status = st.empty() + image_status.image(capture_img, use_column_width=True) + else: + image_status = st.empty() + + if input_text: + # global text_prompts + input_text_prompts = [input_text] + image = dd.generate(input_text_prompts, + capture_img, + clip_guidance_scale=text_scale, + skip_steps=skip_steps, + st_dynamic_image=image_status, + init_scale=None, + side_x=width, + side_y=heigth) # 最终结果。实时显示修改generate里面的内容。 + image_status.image(image, use_column_width=True) diff --git a/fengshen/examples/finetune_bart_qg/README.md b/fengshen/examples/finetune_bart_qg/README.md new file mode 100644 index 0000000000000000000000000000000000000000..33457b448b4356062ad4b1b00a22f00122c4fe83 --- /dev/null +++ b/fengshen/examples/finetune_bart_qg/README.md @@ -0,0 +1,106 @@ +## Randeng-BART-139M-QG-Chinese + + + +## 简介 Brief Introduction + +善于处理问题生成任务的中文版 BART-base 模型。 + +Good at solving question generation tasks Bart-base Model (Chinese version). + +## 模型分类 Model Taxonomy + +| 需求 Demand | 任务 Task | 系列 Series | 模型 Model | 参数 Parameter | 额外 Extra | +| :----: | :----: | :----: | :----: | :----: | :----: | +| 通用 General | 自然语言转换 NLT | 燃灯 Randeng | BART | 139M | 问题生成任务-中文 QuestionGeneration-Chinese | + + +## 模型信息 Model Information + +本模型基于[IDEA-CCNL/Randeng-BART-139M](https://huggingface.co./IDEA-CCNL/Randeng-BART-139M),我们在 [ChineseSQuAD](https://github.com/pluto-junzeng/ChineseSquad) 数据集上微调了问题生成任务版本。 + +Based on [IDEA-CCNL/Randeng-BART-139M](https://huggingface.co./IDEA-CCNL/Randeng-BART-139M), we fine-tuned a question generation version on [ChineseSQuAD](https://github.com/pluto-junzeng/ChineseSquad) datasets. 
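+
+在微调和推理时,输入输出都采用「知识 / 回答 → 问题」的 prompt 拼接格式(与本 PR 中 finetune_bart.py 的 collator 保持一致)。A minimal sketch of the prompt format, assuming the same concatenation as the collator below; the concrete strings here are made-up examples for illustration, not taken from the dataset:
+
+```python
+context = "1939年9月1日德国入侵波兰后,第二次世界大战开始。"
+answer = "1939年9月1日"
+question = "第二次世界大战是什么时候开始的?"
+
+encoder_input = f"知识:{context}回答:{answer}"   # what the encoder sees
+decoder_target = f"问题:{question}"               # what the decoder is trained to produce
+```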
+
+Table1: 模型结构和配置 Model Architecture and Config
+
+| 配置 Config | 参数 Value |
+| ------------------- | --------- |
+| encoder_layers | 6 |
+| encoder_attn_heads | 12 |
+| encoder_ffn_dim | 3072 |
+| decoder_layers | 6 |
+| decoder_attn_heads | 12 |
+| decoder_ffn_dim | 3072 |
+| max_encoder_len | 512 |
+
+
+ChineseSQuAD 数据集翻译了部分 SQuAD 数据集,包含约 67k 有答案的训练样本和 43k 无答案训练样本。我们做了 9:1 的训练-开发集划分,并在公开的开发集上评测了效果。
+
+The dataset is translated from SQuAD 2.0 and contains around 67k samples with answers and 43k samples without answers. We split the training data into train/dev with a ratio of 9:1 and evaluate on the public dev set.
+
+Table 2: 数据集样本量 Dataset Sizes
+
+|       | all | have ans | no ans |
+|:------|:-------|:---------|:-------|
+| train_split | 100097 | 60879 | 39128 |
+| dev_split | 11089 | 6809 | 4280 |
+| dev | 10836 | 6645 | 4191 |
+
+
+## 使用 Usage
+
+### 环境安装 Install
+```
+git clone https://github.com/IDEA-CCNL/Fengshenbang-LM.git
+cd Fengshenbang-LM
+git submodule init
+git submodule update
+# submodule是我们用来管理数据集的fs_datasets,通过ssh的方式拉取,如果用户没有在机器上配置ssh-key的话可能会拉取失败。
+# 如果拉取失败,需要到.gitmodules文件中把ssh地址改为https地址即可。
+pip install --editable .
+```
+
+
+### 模型加载 Loading Models
+```python
+from transformers import AutoTokenizer, BartForConditionalGeneration
+tokenizer = AutoTokenizer.from_pretrained("IDEA-CCNL/Randeng-BART-139M-QG-Chinese", additional_special_tokens=[""])
+model = BartForConditionalGeneration.from_pretrained("IDEA-CCNL/Randeng-BART-139M-QG-Chinese")
+
+context = "知识:1939年9月1日德国入侵波兰后,第二次世界大战开始,华沙一直被保卫到9月27日。波兰中部,包括华沙,都在德国纳粹殖民地政府总政府的统治下。所有的高等教育机构都立即关闭,华沙的犹太人口——几十万,约占城市的 ——全部涌入华沙的贫民区。回答:30%"
+inputs = tokenizer.encode_plus(
+    context,
+    max_length=448,
+    padding="max_length",
+    truncation=True,
+    return_tensors='pt'
+)
+out = model.generate(
+    input_ids=inputs['input_ids'],
+    attention_mask=inputs['attention_mask'],
+    do_sample=True,
+    num_beams=5,
+    max_length=64,
+    top_p=0.9,
+)
+pred = tokenizer.batch_decode(out, clean_up_tokenization_spaces=True, skip_special_tokens=True)[0]
+print(pred)
+# 问题:华沙的犹太人口占城市的百分之多少?
+```
+
+
+
+### 训练 Train
+```bash
+bash finetune_bart.sh
+```
+
+- finetune_bart.py 定义了数据处理输入输出方式和 finetune 的核心代码
+- finetune_bart.sh 训练脚本,具体参数可在此修改
+- utils.py 定义了独立的工具代码,重实现的函数等
+
+
+
+### 下游效果 Performance
+| Dataset | Model Size | BLEU-4 | METEOR | ROUGE-L |
+| ------------ | ----- | -------- | --------- | ---------- |
+| ChineseSQuAD | 139M | 22.17 | 40.38 | 38.17 |
diff --git a/fengshen/examples/finetune_bart_qg/finetune_bart.py b/fengshen/examples/finetune_bart_qg/finetune_bart.py
new file mode 100644
index 0000000000000000000000000000000000000000..d2c64589edf6c146632e656c96b7195d4ae87d81
--- /dev/null
+++ b/fengshen/examples/finetune_bart_qg/finetune_bart.py
@@ -0,0 +1,429 @@
+# -*- encoding: utf-8 -*-
+'''
+Copyright 2022 The International Digital Economy Academy (IDEA). CCNL team. All rights reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@File : finetune_bart.py +@Time : 2022/10/28 18:23 +@Author : Qi Yang +@Version : 1.0 +@Contact : yangqi@idea.edu.cn +@License : (C)Copyright 2022-2023, CCNL-IDEA +''' + + +from fengshen.models.model_utils import configure_optimizers +from fengshen.data.universal_datamodule import UniversalDataModule +from fengshen.utils.universal_checkpoint import UniversalCheckpoint +from fengshen.utils import chinese_char_tokenize +from utils import truncate_sequence, white_space_fix +from utils import LabelSmoothingCrossEntropy +import sys +import os +import torch +import argparse +import pytorch_lightning as pl +from dataclasses import dataclass +from pytorch_lightning import Trainer +from pytorch_lightning.callbacks import LearningRateMonitor +from transformers import BartForConditionalGeneration +from transformers import BertTokenizer, AutoTokenizer +from torchmetrics.text.rouge import ROUGEScore +sys.path.append('../../../') + + +@dataclass +class QGT5Collator: + @ staticmethod + def add_data_specific_args(parent_args): + # the hyperparameters should be determined according to the max length of context in dataset + parser = parent_args.add_argument_group('BART DIalo Collator') + parser.add_argument('--max_seq_length', default=512, type=int) + parser.add_argument('--max_src_length', default=32, type=int) + parser.add_argument('--max_kno_length', default=416, type=int) + parser.add_argument('--max_tgt_length', default=64, type=int) + parser.add_argument('--mask_ans_style', + default='normal', + type=str, + choices=['normal', 'unmask', 'anstoken', 'postag', 'anstoken_multispan', 'postag_multispan', 'normal_multispan']) + return parent_args + + def __init__(self, tokenizer, args): + self.args = args + self.tokenizer = tokenizer + self.max_seq_length = args.max_seq_length + self.print_example = True + self.mask_ans_style = args.mask_ans_style + self.do_eval_only = args.do_eval_only + self.tokenizer_type = args.tokenizer_type + + def encode(self, x, y): + if self.tokenizer_type == "bert": + x = x + y = y + else: + # t5 sentence piece + x = self.tokenizer.bos_token + x + self.tokenizer.eos_token + y = y + self.tokenizer.eos_token + + encoder_input = self.tokenizer.encode_plus( + x, + max_length=self.args.max_kno_length + self.args.max_src_length, + padding="max_length", + truncation=True, + return_tensors='pt' + ) + decoder_output = self.tokenizer.encode_plus( + y, + max_length=self.args.max_tgt_length, + padding="max_length", + truncation=True, + return_tensors='pt' + ) + + return encoder_input, decoder_output + + def mask(self, s): + def replace_span(source, target, sptoken): + ans_bos, ans_eos = s["ans_span"][0] + return source[:ans_bos] + sptoken + source[ans_eos:] + + def replace_all(source, target, sptoken): + return source.replace(target, sptoken) + + if 'multispan' in self.mask_ans_style: + fn = replace_all + else: + fn = replace_span + + # unmask: 北京是中国的首都 + if 'unmask' in self.mask_ans_style: + return s["context"] + + # normal: 北京是 的首都 + if 'normal' in self.mask_ans_style: + self.anstoken = self.tokenizer.mask_token + masked_context = fn(s["context"], s["answer"][0], self.anstoken) + return masked_context + + # anstoken: 北京是 [ANS] 的首都 + if 'anstoken' in self.mask_ans_style: + anstoken_dict = { + "bert": "[ANS]", + "bart": "" + } + self.anstoken = anstoken_dict[self.tokenizer_type] + masked_context = fn(s["context"], s["answer"][0], self.anstoken) + return masked_context + + # postag: 北京是 中国 的首都 + if 'postag' in self.mask_ans_style: + begtoken, endtoken = "", "" + self.anstoken = begtoken + 
s["answer"][0] + endtoken + masked_context = fn(s["context"], s["answer"][0], self.anstoken) + return masked_context + + return masked_context + + def prompt(self, context, answer, question): + pre_prompt, mid_prompt, post_prompt = "知识:", "回答:", "问题:" # prompt + + context = truncate_sequence(context, self.args.max_kno_length-len(pre_prompt)-1) + + # used in squad-2.0 + # noted that src and tgt is reversed in qg + answer = truncate_sequence(answer, self.args.max_src_length - len(mid_prompt)-1) + question = truncate_sequence(question, self.args.max_tgt_length-len(post_prompt)-1) + + x_trunc = f'{pre_prompt}{context}{mid_prompt}{answer}' + y_trunc = f'{post_prompt}{question}' + return x_trunc, y_trunc + + def __call__(self, samples): + """ + ans_num = 1 适用于 Train 数据只有 1 条 answer 取第一条情况 + ans_num > 1 适用于 Dev 数据有多条 answer 情况 + Input: + input_ids: input_ids (text + answer) + attn_mask: input attn mask + labels: decoder_ids (question) + """ + input_ids, attn_mask, labels = [], [], [] + ans, qes, ctx, ans_spans, idxs, imp = [], [], [], [], [], [] + + for s in samples: + if self.do_eval_only: + # log origin answer to compare + ans.append(s["answer"]) + qes.append(s["question"]) + ctx.append(s["context"]) + ans_spans.append(s["ans_span"]) + idxs.append(s["idx"]) + + if "is_impossible" in s: + imp.append(s["is_impossible"]) + else: + imp.append(False) # SQUAD 1.0 don't have is_impossible + + if not s["is_impossible"]: # have ans and ans_span + context = self.mask(s) + answer = s["answer"][0] + question = s["question"] + else: # no ans and ans_span + context = s["context"] + answer = "无答案" + question = s["question"] + + x_trunc, y_trunc = self.prompt(context, answer, question) + encoder_input, decoder_output = self.encode(x_trunc, y_trunc) + + input_ids.append(encoder_input["input_ids"]) + attn_mask.append(encoder_input["attention_mask"]) + labels.append(decoder_output["input_ids"]) + + labels = torch.cat(labels) + if self.tokenizer_type == "bart": + end_token_index = torch.where(labels == self.tokenizer.eos_token_id)[1] + else: + end_token_index = torch.where(labels == self.tokenizer.sep_token_id)[1] + for idx, end_idx in enumerate(end_token_index): + labels[idx][end_idx + 1:] = -100 # cross entropy cal + + data = { + 'input_ids': torch.cat(input_ids), + 'attention_mask': torch.cat(attn_mask), + 'labels': labels + } + if self.do_eval_only: + data.update({ + 'answer': ans, + 'question': qes, + 'context': ctx, + 'ans_span': ans_spans, + 'idx': idxs, + 'is_impossible': imp + }) + + if self.print_example: + print(x_trunc) + print(y_trunc) + self.print_example = False + + return data + + +class BARTFinetuneModel(pl.LightningModule): + @staticmethod + def add_model_specific_args(parent_args): + parser = parent_args.add_argument_group('BaseModel') + parser.add_argument('--model_path', type=str, default='') + parser.add_argument('--learning_rate', default=1e-5, type=float) + parser.add_argument('--min_learning_rate', default=1e-7, type=float) + parser.add_argument('--lr_decay_steps', default=0, type=int) + parser.add_argument('--lr_decay_ratio', default=1.0, type=float) + parser.add_argument('--weight_decay', default=0.1, type=float) + parser.add_argument('--warmup_steps', default=1000, type=int) + parser.add_argument('--warmup_ratio', default=0.01, type=float) + parser.add_argument('--label_smooth', default=0, type=float) + parser.add_argument('--new_token_path', default="./", type=str) # save new token after add special token + parser.add_argument('--adam_beta1', default=0.9, type=float) + 
parser.add_argument('--adam_beta2', default=0.999, type=float) + parser.add_argument('--adam_epsilon', default=1e-8, type=float) + parser.add_argument('--scheduler_type', default='polynomial', type=str) + + return parent_args + + def __init__(self, tokenizer, args): + super().__init__() + self.save_hyperparameters(args) + self.model = BartForConditionalGeneration.from_pretrained(args.model_path) + self.tokenizer = tokenizer + + # add special token ans + # self.tokenizer.save_vocabulary(self.args.model_path) + new_vocab = args.model_path+"/sp_vocab/" + if not os.path.exists(new_vocab): + os.makedirs(new_vocab) + self.tokenizer.save_pretrained(new_vocab) + self.model.resize_token_embeddings(len(tokenizer)) + self.vocab_size = len(tokenizer) + self.rougescore = ROUGEScore(rouge_keys=('rougeL'), normalizer=lambda x: x) + + if self.hparams.label_smooth: + self.loss_fct = LabelSmoothingCrossEntropy(smoothing=0.1) + + def setup(self, stage) -> None: + if stage == 'fit': + train_loader = self.trainer._data_connector._train_dataloader_source.dataloader() + + # Calculate total steps + if self.trainer.max_epochs > 0: + world_size = self.trainer.world_size + tb_size = self.hparams.train_batchsize * max(1, world_size) + ab_size = self.trainer.accumulate_grad_batches * float(self.trainer.max_epochs) + self.total_steps = (len(train_loader.dataset) * + self.trainer.max_epochs // tb_size) // ab_size + else: + self.total_steps = self.trainer.max_steps // self.trainer.accumulate_grad_batches + + print('Total steps: {}' .format(self.total_steps)) + + def configure_optimizers(self): + return configure_optimizers(self) + + def training_step(self, batch, batch_idx): + output = self.model( + input_ids=batch['input_ids'], + attention_mask=batch['attention_mask'], + labels=batch['labels']) + + loss = output.loss + if self.hparams.label_smooth: + loss = self.loss_fct(output.logits.view(-1, self.vocab_size), batch["labels"].view(-1)) + + self.log('train_loss', loss, sync_dist=True) + return loss + + def validation_step(self, batch, batch_idx): + output = self.model( + input_ids=batch['input_ids'], + attention_mask=batch['attention_mask'], + labels=batch['labels']) + acc = self.compute_acc(output.logits, batch['labels']) + self.log('val_loss', output.loss, sync_dist=True) + self.log('val_acc', acc, sync_dist=True) + self.log('val_ppl', torch.exp(output.loss), sync_dist=True) + + cond_output = self.model.generate( + input_ids=batch['input_ids'], + attention_mask=batch['attention_mask'], + do_sample=True, + num_beams=5, + early_stopping=True, + max_length=64, + top_p=0.9, + ) + + batch_label = torch.where(batch["labels"] != -100, batch["labels"], self.tokenizer.pad_token_id) + pred = self.tokenizer.batch_decode(cond_output, clean_up_tokenization_spaces=True, skip_special_tokens=True) + ques = self.tokenizer.batch_decode(batch_label, clean_up_tokenization_spaces=True, skip_special_tokens=True) + + pred = [chinese_char_tokenize(white_space_fix(p)) for p in pred] + ques = [chinese_char_tokenize(white_space_fix(q)) for q in ques] + self.rougescore.update(pred, ques) + + return pred + + def validation_epoch_end(self, validation_step_outputs): + rouge = self.rougescore.compute() + self.log('val_rouge', rouge["rougeL_fmeasure"], sync_dist=True) + + def on_predict_start(self): + self.loss_fct = torch.nn.CrossEntropyLoss(reduction='none') + + def predict_step(self, batch, batch_idx): + output = self.model( + input_ids=batch['input_ids'], + attention_mask=batch['attention_mask'], + labels=batch['labels']) + + loss_tensor = 
self.loss_fct(output.logits.transpose(1, 2), batch["labels"]) + if self.hparams.tokenizer_type == 'bart': + eos_index = torch.where(batch['labels'] == self.tokenizer.eos_token_id)[1] + elif self.hparams.tokenizer_type == 'bert': + eos_index = torch.where(batch['labels'] == self.tokenizer.sep_token_id)[1] + + loss = torch.sum(loss_tensor, dim=1) / eos_index + + with torch.no_grad(): + cond_output = self.model.generate( + input_ids=batch['input_ids'], + attention_mask=batch['attention_mask'], + do_sample=True, + num_beams=5, + max_length=64, + top_p=0.9, + output_scores=True, + return_dict_in_generate=True + ) + + pred = self.tokenizer.batch_decode( + cond_output.sequences, clean_up_tokenization_spaces=True, skip_special_tokens=True) # ['sequences'] + pred = [white_space_fix(p) for p in pred] # remove prompt and white space + score = cond_output.sequences_scores + return pred, score, loss + + def compute_acc(self, logits, labels): + y_pred = torch.argmax(logits, dim=-1) + y_pred = y_pred.view(size=(-1,)) + y_true = labels.view(size=(-1,)).float() + corr = torch.eq(y_pred, y_true) + acc = torch.sum(corr.float())/y_true.shape[0] + return acc + + def on_save_checkpoint(self, checkpoint) -> None: + if self.trainer._accelerator_connector.cluster_environment.global_rank() == 0: + self.model.save_pretrained(os.path.join( + self.trainer.checkpoint_callback.dirpath, + 'hf_pretrained_epoch{}_step{}'.format(checkpoint['epoch'], checkpoint['global_step']))) + + def on_load_checkpoint(self, checkpoint) -> None: + global_step_offset = checkpoint["global_step"] + if 'global_samples' in checkpoint: + self.consumed_samples = checkpoint['global_samples'] + self.trainer.fit_loop.epoch_loop._batches_that_stepped = global_step_offset + + +def get_tokenizer(tokenizer_type, pretrained_model_path): + if tokenizer_type == 'bart': + tokenizer = AutoTokenizer.from_pretrained( + pretrained_model_path, use_fast=False, additional_special_tokens=["", "", ""]) + print(len(tokenizer)) + elif tokenizer_type == 'bert': + tokenizer = BertTokenizer.from_pretrained( + pretrained_model_path, use_fast=False, additional_special_tokens=["[ANS]"]) + return tokenizer + + +def main(): + total_parser = argparse.ArgumentParser("Finetune BART for QG") + total_parser.add_argument('--do_eval_only', action='store_true', default=False) + total_parser.add_argument('--tokenizer_type', type=str, default="bart", choices=['bart', 'bert']) + total_parser.add_argument('--tensorboard_dir', type=str, default="bart") + total_parser.add_argument('--deepspeed') + + total_parser = UniversalDataModule.add_data_specific_args(total_parser) + total_parser = QGT5Collator.add_data_specific_args(total_parser) + total_parser = Trainer.add_argparse_args(total_parser) + total_parser = UniversalCheckpoint.add_argparse_args(total_parser) + total_parser = BARTFinetuneModel.add_model_specific_args(total_parser) + args = total_parser.parse_args() + + tokenizer = get_tokenizer(args.tokenizer_type, args.model_path) + collator = QGT5Collator(tokenizer=tokenizer, args=args) + data_model = UniversalDataModule(collate_fn=collator, tokenizer=tokenizer, args=args) + print("Data load complete...") + + if args.deepspeed is not None: + os.environ['PL_DEEPSPEED_CONFIG_PATH'] = args.deepspeed + + model = BARTFinetuneModel(tokenizer, args) + checkpoint_callback = UniversalCheckpoint(args) + lr_monitor = LearningRateMonitor(logging_interval='step') + trainer = Trainer.from_argparse_args(args, + callbacks=[checkpoint_callback, lr_monitor] + ) + + if not args.do_eval_only: + 
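+        # Training path: when --do_eval_only is not set, run standard fitting;
+        # validation (loss, accuracy, perplexity, ROUGE-L) runs at the interval
+        # configured via --val_check_interval.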
        trainer.fit(model, data_model)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/fengshen/examples/finetune_bart_qg/finetune_bart.sh b/fengshen/examples/finetune_bart_qg/finetune_bart.sh
new file mode 100644
index 0000000000000000000000000000000000000000..ae88b230fa223c3d2c519e4f09cb1c703319af48
--- /dev/null
+++ b/fengshen/examples/finetune_bart_qg/finetune_bart.sh
@@ -0,0 +1,97 @@
+#!/bin/bash
+#SBATCH --job-name=bart_qg          # create a short name for your job
+#SBATCH --nodes=1                   # node count
+#SBATCH --ntasks-per-node=8         # number of tasks to run per node
+#SBATCH --cpus-per-task=10          # cpu-cores per task (>1 if multi-threaded tasks)
+#SBATCH --gres=gpu:1                # number of gpus per node
+#SBATCH -o %x-%j.log                # output and error log file names (%x for job name, %j for job id)
+set -x -e
+
+MODEL_NAME=IDEA-CCNL/Randeng-BART-139M
+RUN_NAME=bart_v0_test
+ROOT_DIR=../../workspace/log/$RUN_NAME
+
+config_json="$ROOT_DIR/$MODEL_NAME.ds_config.json"
+export MASTER_PORT=$[RANDOM%10000+40000]
+
+MICRO_BATCH_SIZE=32
+
+cat <<EOT > $config_json
+{
+    "train_micro_batch_size_per_gpu": $MICRO_BATCH_SIZE,
+    "gradient_clipping": 1,
+    "zero_optimization": {
+        "stage": 1
+    },
+    "fp16": {
+        "enabled": true
+    }
+}
+EOT
+export PL_DEEPSPEED_CONFIG_PATH=$config_json
+export TORCH_EXTENSIONS_DIR=../../workspace/torch_extensions
+
+DATA_ARGS=" \
+        --train_file train.json \
+        --val_file dev.json \
+        --test_file test.json \
+        --tokenizer_type bart \
+        --num_workers 8 \
+        --dataloader_workers 2 \
+        --train_batchsize $MICRO_BATCH_SIZE \
+        --val_batchsize $MICRO_BATCH_SIZE \
+        --test_batchsize $MICRO_BATCH_SIZE \
+        --max_seq_length 512 \
+        --max_src_length 32 \
+        --max_kno_length 416 \
+        --max_tgt_length 64 \
+        --mask_ans_style anstoken_multispan \
+        "
+
+MODEL_ARGS="\
+        --model_path $MODEL_NAME/ \
+        --learning_rate 1e-4 \
+        --min_learning_rate 1e-8 \
+        --lr_decay_steps 100000 \
+        --weight_decay 1e-2 \
+        --warmup_steps 1000 \
+        "
+
+MODEL_CHECKPOINT_ARGS="\
+        --monitor val_loss \
+        --save_top_k 3 \
+        --mode min \
+        --save_last \
+        --every_n_train_steps 5000 \
+        --save_ckpt_path $ROOT_DIR/ckpt/ \
+        --load_ckpt_path $ROOT_DIR/ckpt/ \
+        --filename model-{step:02d}-{train_loss:.4f} \
+        "
+
+TRAINER_ARGS="\
+        --gradient_clip_val 1.0 \
+        --max_epochs 1 \
+        --gpus 1 \
+        --num_nodes 1 \
+        --strategy ddp \
+        --log_every_n_steps 100 \
+        --val_check_interval 0.5 \
+        --accumulate_grad_batches 1 \
+        --default_root_dir $ROOT_DIR \
+        --tensorboard_dir $ROOT_DIR \
+        --label_smooth 0.1 \
+        "
+
+
+
+export options=" \
+        $DATA_ARGS \
+        $MODEL_ARGS \
+        $MODEL_CHECKPOINT_ARGS \
+        $TRAINER_ARGS \
+        "
+# test
+export SCRIPT_PATH=./finetune_bart.py
+
+python3 ${SCRIPT_PATH} $options > $ROOT_DIR/test.log
diff --git a/fengshen/examples/finetune_bart_qg/utils.py b/fengshen/examples/finetune_bart_qg/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..25cc1ef54673d3e7a465901eb905c4889f1397fd
--- /dev/null
+++ b/fengshen/examples/finetune_bart_qg/utils.py
@@ -0,0 +1,70 @@
+# -*- encoding: utf-8 -*-
+'''
+Copyright 2022 The International Digital Economy Academy (IDEA). CCNL team. All rights reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@File : utils.py +@Time : 2022/10/28 18:27 +@Author : Qi Yang +@Version : 1.0 +@Contact : yangqi@idea.edu.cn +@License : (C)Copyright 2022-2023, CCNL-IDEA +''' + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import torch +import torch.nn.functional as F + + +class LabelSmoothingCrossEntropy(torch.nn.Module): + def __init__(self, smoothing=0.1): + super(LabelSmoothingCrossEntropy, self).__init__() + self.smoothing = smoothing + self.ignore_index = -100 + + def forward(self, x, target): + confidence = 1. - self.smoothing + logprobs = F.log_softmax(x, dim=-1) + targets_ignore = torch.where(target != self.ignore_index, target, 0) + nll_loss = -logprobs.gather(dim=-1, index=targets_ignore.unsqueeze(1)) + nll_loss = nll_loss.squeeze(1) + smooth_loss = -logprobs.mean(dim=-1) + loss = confidence * nll_loss + self.smoothing * smooth_loss + return loss.mean() + + +def truncate_sequence(document: str, max_num_tokens: int, reverse=False): + total_length = len(document) + if total_length <= max_num_tokens: + return document + else: + if reverse: + return document[-1*max_num_tokens:] + else: + return document[:max_num_tokens] + + +def padding_to_maxlength(ids, max_length, pad_id): + cur_len = len(ids) + len_diff = max_length - len(ids) + return ids + [pad_id] * len_diff, [1] * cur_len + [0] * len_diff + + +def white_space_fix(text): + return "".join(text.split(" ")) + + +def remove_prompt(text): + if ":" in text: + return text.split(":")[1] + return text diff --git a/fengshen/examples/finetune_taiyi_stable_diffusion/demo_dataset/part_0/00000001.jpg b/fengshen/examples/finetune_taiyi_stable_diffusion/demo_dataset/part_0/00000001.jpg new file mode 100644 index 0000000000000000000000000000000000000000..2b6037b10173b0f6f03563b7df8e0378821fb18f Binary files /dev/null and b/fengshen/examples/finetune_taiyi_stable_diffusion/demo_dataset/part_0/00000001.jpg differ diff --git a/fengshen/examples/finetune_taiyi_stable_diffusion/demo_dataset/part_0/00000001.txt b/fengshen/examples/finetune_taiyi_stable_diffusion/demo_dataset/part_0/00000001.txt new file mode 100644 index 0000000000000000000000000000000000000000..97b1b5ec990c1574dcdf9743392ef543e044e3ee --- /dev/null +++ b/fengshen/examples/finetune_taiyi_stable_diffusion/demo_dataset/part_0/00000001.txt @@ -0,0 +1 @@ +牛津高阶英汉双解词典 (第6版)(内容一致,印次、封面或原价不同,统一售价,随机发货 \ No newline at end of file diff --git a/fengshen/examples/finetune_taiyi_stable_diffusion/demo_dataset/part_0/00000002.jpg b/fengshen/examples/finetune_taiyi_stable_diffusion/demo_dataset/part_0/00000002.jpg new file mode 100644 index 0000000000000000000000000000000000000000..2c70487b1ae5803d8249a56130b0cfd6fbf0d722 Binary files /dev/null and b/fengshen/examples/finetune_taiyi_stable_diffusion/demo_dataset/part_0/00000002.jpg differ diff --git a/fengshen/examples/finetune_taiyi_stable_diffusion/demo_dataset/part_0/00000002.txt b/fengshen/examples/finetune_taiyi_stable_diffusion/demo_dataset/part_0/00000002.txt new file mode 100644 index 0000000000000000000000000000000000000000..14810ab1fa32db2e3d225a3aa2b5a6280ae596ea --- /dev/null +++ b/fengshen/examples/finetune_taiyi_stable_diffusion/demo_dataset/part_0/00000002.txt @@ -0,0 +1 @@ +照相机显示走和做购物的愉快的人民 股票视频 \ No newline at end of file diff --git a/fengshen/examples/finetune_taiyi_stable_diffusion/demo_dataset/part_0/00000003.jpg b/fengshen/examples/finetune_taiyi_stable_diffusion/demo_dataset/part_0/00000003.jpg new file mode 100644 index 
0000000000000000000000000000000000000000..a9c82a86204a28a28fcb5d2e6276cb677cc2a26f --- /dev/null +++ b/fengshen/examples/finetune_taiyi_stable_diffusion/demo_dataset/part_0/00000003.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a18890cf53a412387ef810badb41917351f010920f49f07ccadcce6f0e990d29 +size 2083013 diff --git a/fengshen/examples/finetune_taiyi_stable_diffusion/demo_dataset/part_0/00000003.txt b/fengshen/examples/finetune_taiyi_stable_diffusion/demo_dataset/part_0/00000003.txt new file mode 100644 index 0000000000000000000000000000000000000000..6d1696755083f4b975a08e9521ab7677438b83e1 --- /dev/null +++ b/fengshen/examples/finetune_taiyi_stable_diffusion/demo_dataset/part_0/00000003.txt @@ -0,0 +1 @@ +直升机战争VR图4 \ No newline at end of file diff --git a/fengshen/examples/finetune_taiyi_stable_diffusion/demo_dataset/part_0/00000004.jpg b/fengshen/examples/finetune_taiyi_stable_diffusion/demo_dataset/part_0/00000004.jpg new file mode 100644 index 0000000000000000000000000000000000000000..575d4da8cca0bb2e250352d0e0914bcd65d1886c Binary files /dev/null and b/fengshen/examples/finetune_taiyi_stable_diffusion/demo_dataset/part_0/00000004.jpg differ diff --git a/fengshen/examples/finetune_taiyi_stable_diffusion/demo_dataset/part_0/00000004.txt b/fengshen/examples/finetune_taiyi_stable_diffusion/demo_dataset/part_0/00000004.txt new file mode 100644 index 0000000000000000000000000000000000000000..43b4695f6b60f8183b94e50129e92915e9571cf3 --- /dev/null +++ b/fengshen/examples/finetune_taiyi_stable_diffusion/demo_dataset/part_0/00000004.txt @@ -0,0 +1 @@ +彩绘百合花图片 \ No newline at end of file diff --git a/fengshen/examples/finetune_taiyi_stable_diffusion/demo_dataset/part_0/00000005.jpg b/fengshen/examples/finetune_taiyi_stable_diffusion/demo_dataset/part_0/00000005.jpg new file mode 100644 index 0000000000000000000000000000000000000000..c99776bb2c9475923a4fbde0716397789741cce1 Binary files /dev/null and b/fengshen/examples/finetune_taiyi_stable_diffusion/demo_dataset/part_0/00000005.jpg differ diff --git a/fengshen/examples/finetune_taiyi_stable_diffusion/demo_dataset/part_0/00000005.txt b/fengshen/examples/finetune_taiyi_stable_diffusion/demo_dataset/part_0/00000005.txt new file mode 100644 index 0000000000000000000000000000000000000000..890c536b1098772268d01a7760e79c89c646f55b --- /dev/null +++ b/fengshen/examples/finetune_taiyi_stable_diffusion/demo_dataset/part_0/00000005.txt @@ -0,0 +1 @@ +用与巧克力蛋糕的正确的新月形面包和在灰色木背景的一个桔子,其次洒与桃红色 图库摄影 \ No newline at end of file diff --git a/fengshen/examples/finetune_taiyi_stable_diffusion/demo_dataset/part_0/00000006.jpg b/fengshen/examples/finetune_taiyi_stable_diffusion/demo_dataset/part_0/00000006.jpg new file mode 100644 index 0000000000000000000000000000000000000000..1bd7b79ebaaa257fbfac8af4f015dc83c5a038e2 Binary files /dev/null and b/fengshen/examples/finetune_taiyi_stable_diffusion/demo_dataset/part_0/00000006.jpg differ diff --git a/fengshen/examples/finetune_taiyi_stable_diffusion/demo_dataset/part_0/00000006.txt b/fengshen/examples/finetune_taiyi_stable_diffusion/demo_dataset/part_0/00000006.txt new file mode 100644 index 0000000000000000000000000000000000000000..d98543289903785609bc0d1d878321a69b80b231 --- /dev/null +++ b/fengshen/examples/finetune_taiyi_stable_diffusion/demo_dataset/part_0/00000006.txt @@ -0,0 +1 @@ +可燃气体油管 库存图片 \ No newline at end of file diff --git a/fengshen/examples/finetune_taiyi_stable_diffusion/evaluate.sh b/fengshen/examples/finetune_taiyi_stable_diffusion/evaluate.sh new file mode 100644 index 
0000000000000000000000000000000000000000..8b7d5412f7bd75cb0700cca0699e029a022db7a7 --- /dev/null +++ b/fengshen/examples/finetune_taiyi_stable_diffusion/evaluate.sh @@ -0,0 +1,15 @@ +#!/bin/bash +#SBATCH --job-name=evaluate_model # create a short name for your job +#SBATCH --nodes=1 # node count +#SBATCH --ntasks-per-node=1 # number of tasks to run per node +#SBATCH --cpus-per-task=5 # cpu-cores per task (>1 if multi-threaded tasks) +#SBATCH --gres=gpu:1 # number of gpus per node +#SBATCH -o inference_log/%x-%j.log # output and error log file names (%x for job id) +#SBATCH -p batch +#SBATCH --qos=ai4cogsys + +export SCRIPT_PATH=./evaluate_model.py + +MODEL_PATH='' + +srun python $SCRIPT_PATH $MODEL_PATH \ No newline at end of file diff --git a/fengshen/examples/finetune_taiyi_stable_diffusion/evaluate_model.py b/fengshen/examples/finetune_taiyi_stable_diffusion/evaluate_model.py new file mode 100644 index 0000000000000000000000000000000000000000..b92b6a1e29bb31af553fd2e924a0a2b0dcdb4873 --- /dev/null +++ b/fengshen/examples/finetune_taiyi_stable_diffusion/evaluate_model.py @@ -0,0 +1,294 @@ +import pytorch_lightning as pl +import torch.nn as nn +import torch.nn.functional as F +import torch +import timm +from torchvision import transforms as T +import open_clip +import sys +import torch +import json +from transformers import BertModel, BertTokenizer +from PIL import Image +from diffusers import StableDiffusionPipeline +import random +import os +from tqdm import tqdm + +os.environ['CUDA_LAUNCH_BLOCKING']='1' +torch.backends.cudnn.benchmark = True + +class AestheticsMLP(pl.LightningModule): + # 美学判别器是基于CLIP的基础上接了一个MLP + def __init__(self, input_size, xcol='emb', ycol='avg_rating'): + super().__init__() + self.input_size = input_size + self.xcol = xcol + self.ycol = ycol + self.layers = nn.Sequential( + nn.Linear(self.input_size, 1024), + #nn.ReLU(), + nn.Dropout(0.2), + nn.Linear(1024, 128), + #nn.ReLU(), + nn.Dropout(0.2), + nn.Linear(128, 64), + #nn.ReLU(), + nn.Dropout(0.1), + + nn.Linear(64, 16), + #nn.ReLU(), + + nn.Linear(16, 1) + ) + + def forward(self, x): + return self.layers(x) + + def training_step(self, batch, batch_idx): + x = batch[self.xcol] + y = batch[self.ycol].reshape(-1, 1) + x_hat = self.layers(x) + loss = F.mse_loss(x_hat, y) + return loss + + def validation_step(self, batch, batch_idx): + x = batch[self.xcol] + y = batch[self.ycol].reshape(-1, 1) + x_hat = self.layers(x) + loss = F.mse_loss(x_hat, y) + return loss + + def configure_optimizers(self): + optimizer = torch.optim.Adam(self.parameters(), lr=1e-3) + return optimizer + + +class WaterMarkModel(nn.Module): + def __init__(self, model_path='./watermark_model_v1.pt'): + super(WaterMarkModel, self).__init__() + # model definition + self.model = timm.create_model( + 'efficientnet_b3a', pretrained=True, num_classes=2) + + self.model.classifier = nn.Sequential( + # 1536 is the orginal in_features + nn.Linear(in_features=1536, out_features=625), + nn.ReLU(), # ReLu to be the activation function + nn.Dropout(p=0.3), + nn.Linear(in_features=625, out_features=256), + nn.ReLU(), + nn.Linear(in_features=256, out_features=2), + ) + self.model.load_state_dict(torch.load(model_path)) + def forward(self, x): + return self.model(x) + +class FilterSystem: + def __init__( + self, + clip_model_path="IDEA-CCNL/Taiyi-CLIP-RoBERTa-102M-ViT-L-Chinese", + aesthetics_model_path="./ava+logos-l14-linearMSE.pth", + watermark_model_path="./watermark_model_v1.pt" + ): + self.clip_model_path = clip_model_path + self.aesthetics_model_path = 
aesthetics_model_path + self.watermark_model_path = watermark_model_path + self.init_aesthetics_model() + self.init_clip_model() + self.init_watermark_model() + + def init_clip_model(self, ): + # 此处初始化clip模型,返回模型、tokenizer、processor + text_encoder = BertModel.from_pretrained(self.clip_model_path).eval().cuda() + text_tokenizer = BertTokenizer.from_pretrained(self.clip_model_path) + clip_model, _, processor = open_clip.create_model_and_transforms('ViT-L-14', pretrained='openai') + clip_model = clip_model.eval().cuda() + self.text_encoder, self.text_tokenizer, self.clip_model, self.processor = text_encoder, text_tokenizer, clip_model, processor + print("clip model loaded") + return None + + def init_aesthetics_model(self, ): + # 此处初始化美学模型 + self.aesthetics_model = AestheticsMLP(768) + self.aesthetics_model.load_state_dict(torch.load(self.aesthetics_model_path)) + self.aesthetics_model.eval().cuda() + print("aesthetics model loaded") + return None + + def init_watermark_model(self, ): + self.watermark_model = WaterMarkModel(self.watermark_model_path) + self.watermark_model.eval().cuda() + self.watermark_processor = T.Compose([ + T.Resize((256, 256)), + T.ToTensor(), + T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]) + ]) + print("watermark model loaded") + return None + + def get_image_feature(self, images): + # 此处返回图像的特征向量 + if isinstance(images, list): + images = torch.stack([self.processor(image) for image in images]).cuda() + elif isinstance(images, torch.Tensor): + images = images.cuda() + else: + images = self.processor(images).cuda() + + with torch.no_grad(): + image_features = self.clip_model.encode_image(images) + image_features /= image_features.norm(dim=1, keepdim=True) + return image_features + + def get_text_feature(self, text): + # 此处返回文本的特征向量 + if isinstance(text, list) or isinstance(text, str): + text = self.text_tokenizer(text, return_tensors='pt', padding=True)['input_ids'].cuda() + elif isinstance(text, torch.Tensor): + text = text.cuda() + + with torch.no_grad(): + text_features = self.text_encoder(text)[1] + text_features /= text_features.norm(dim=1, keepdim=True) + return text_features + + def calculate_clip_score(self, features1, features2): + # 此处2个特征向量的相似度,输入可以是 图片+文本、文本+文本、图片+图片。 + # 返回的是相似度矩阵,维度为 f1.shape[0] * f2.shape[0] + score_matrix = features1 @ features2.t() + return score_matrix + + def get_clip_score(self, text, image): + text_feature = self.get_text_feature(text) + image_feature = self.get_image_feature(image) + return self.calculate_clip_score(text_feature, image_feature) + + def get_aesthetics_score(self, features): + # 此处返回美学分数,传入的是CLIP的feature, 先计算get_image_feature在传入此函数~(模型是ViT-L-14) + with torch.no_grad(): + scores = self.aesthetics_model(features) + scores = scores[:, 0].detach().cpu().numpy() + return scores + + def get_watermark_score(self, images): + if isinstance(images, list): + images = torch.stack([self.watermark_processor(image) for image in images]).cuda() + elif isinstance(images, torch.Tensor): + images = images.cuda() + with torch.no_grad(): + pred = self.watermark_model(images) + watermark_scores = F.softmax(pred, dim=1)[:,0].detach().cpu().numpy() + + return watermark_scores + +class InferenceFlickr: + def __init__(self, sd_model_list, sample_num=20, guidance_scale=7.5, test_caption_path="/cognitive_comp/chenweifeng/project/dataset/mm_data/Flickr30k-CNA/test/flickr30k_cn_test.txt"): + self.model_name_list = sd_model_list + self.guidance_scale = guidance_scale + self.sample_num=sample_num + self.score_model = FilterSystem() + 
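+        # FilterSystem (defined above) bundles the Taiyi Chinese CLIP text encoder,
+        # an open_clip ViT-L/14 image encoder, the aesthetics MLP and the watermark
+        # classifier; every generated image is scored with all three metrics below.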
self.caption_path = test_caption_path + self.score = dict() + self.final_score = dict() + + def init_model(self): + self.model_list = [] + for model_name in self.model_name_list: + pipe = StableDiffusionPipeline.from_pretrained(model_name, torch_dtype=torch.float16).to("cuda") + self.model_list.append(pipe) + + def generate_image_score(self, prompt_list, model_list): + generator = torch.Generator(device=0) + generator = generator.manual_seed(42) + # num_images = 1 + # latents = None + # seeds = [] + # for _ in range(num_images): + # generator = generator.manual_seed(42) + + # image_latents = torch.randn( + # (1, pipe.unet.in_channels, 512 // 8, 512 // 8), + # generator = generator, + # device =1 + # ) + # latents = image_latents if latents is None else torch.cat((latents, image_latents)) + for i, model in enumerate(model_list): + model_name = self.model_name_list[i] + self.score[model_name] = dict() + for j, prompt in tqdm(enumerate(prompt_list)): + latents = None + image_latents = torch.randn( + (1, model.unet.in_channels, 512 // 8, 512 // 8), + generator = generator, + device =0, + dtype=torch.float16 + ) + latents = image_latents if latents is None else torch.cat((latents, image_latents)) + image = model(prompt, guidance_scale=self.guidance_scale, latents=latents, torch_dtype=torch.float16).images[0] + image_feature = self.score_model.get_image_feature([image]) + text_feature = self.score_model.get_text_feature(prompt) + image_clip_score = self.score_model.calculate_clip_score(image_feature, text_feature) + image_watermark_score = self.score_model.get_watermark_score([image]) + image_aesthetics_score =self.score_model.get_aesthetics_score(image_feature) + self.score[model_name][prompt] = { + "clip_score": float(image_clip_score[0][0]), + "watermark_score": float(image_watermark_score[0]), + "aesthetics_score": float(image_aesthetics_score[0]), + } + image.save(f"tmp/{prompt}_model-{str(i)}.png") + + def get_prompt_list(self, seed=42, ): + with open(self.caption_path) as fin: + input_lines = fin.readlines() + tmp_list = [] + for line in input_lines: + infos = line.strip('\n').split('\t') + prompt = infos[1] + tmp_list.append(prompt) + random.seed(seed) + prompt_list = random.sample(tmp_list, self.sample_num) + return prompt_list + + def run(self): + self.init_model() + prompt_list = self.get_prompt_list() + self.generate_image_score(prompt_list, self.model_list) + + def show(self, save_path=None): + # print(self.score) + print(self.final_score) + if save_path: + with open(save_path, 'w') as fout: + json.dump(fout, self.final_score, indent=2, ensure_ascii=False) + + def calculate_score(self,): + for model_name in self.score.keys(): + clip_score = 0.0 + watermark_score = 0.0 + aesthetics_score = 0.0 + for prompt in self.score[model_name]: + clip_score += self.score[model_name][prompt]['clip_score'] + watermark_score += self.score[model_name][prompt]['watermark_score'] + aesthetics_score += self.score[model_name][prompt]['aesthetics_score'] + average_clip_score = clip_score / len(self.score[model_name].keys()) + average_watermark_score = watermark_score / len(self.score[model_name].keys()) + average_aesthetics_score = aesthetics_score / len(self.score[model_name].keys()) + self.final_score[model_name] = {"avg_clip": average_clip_score, "avg_watermark": average_watermark_score, 'avg_aesthetics': average_aesthetics_score} + +def main(): + model_path = sys.argv[1] + model_list = [ + # '/cognitive_comp/chenweifeng/project/stable-diffusion-lightning/finetune_taiyi_v0.40_laion', + # 
'/cognitive_comp/chenweifeng/project/stable-diffusion-chinese/finetune_taiyi0' + # "/cognitive_comp/lixiayu/diffuser_models/wukong_epoch1" + # "/cognitive_comp/lixiayu/work/Fengshenbang-LM/fengshen/workspace/taiyi-stablediffusion-laion/60per_ckpt", + model_path + ] + score_model = InferenceFlickr(model_list, sample_num=1000) + score_model.run() + score_model.calculate_score() + score_model.show() + +if __name__ == "__main__": + main() diff --git a/fengshen/examples/finetune_taiyi_stable_diffusion/finetune.py b/fengshen/examples/finetune_taiyi_stable_diffusion/finetune.py new file mode 100644 index 0000000000000000000000000000000000000000..c9f27358402cd0de23353acf6eaedf247949ec0a --- /dev/null +++ b/fengshen/examples/finetune_taiyi_stable_diffusion/finetune.py @@ -0,0 +1,188 @@ +import os +import torch +import argparse +from pytorch_lightning import ( + LightningModule, + Trainer, +) +from pytorch_lightning.callbacks import ( + LearningRateMonitor, +) +from fengshen.data.universal_datamodule import UniversalDataModule +from fengshen.models.model_utils import ( + add_module_args, + configure_optimizers, + get_total_steps, +) +from fengshen.utils.universal_checkpoint import UniversalCheckpoint +from transformers import BertTokenizer, BertModel +from diffusers import AutoencoderKL, DDPMScheduler, StableDiffusionPipeline, UNet2DConditionModel +from torch.nn import functional as F +from fengshen.data.taiyi_stable_diffusion_datasets.taiyi_datasets import add_data_args, load_data +from torchvision import transforms +from PIL import Image +from torch.utils.data._utils.collate import default_collate + + +class Collator(): + def __init__(self, args, tokenizer): + self.image_transforms = transforms.Compose( + [ + transforms.Resize( + args.resolution, interpolation=transforms.InterpolationMode.BILINEAR), + transforms.CenterCrop( + args.resolution) if args.center_crop else transforms.RandomCrop(args.resolution), + transforms.ToTensor(), + transforms.Normalize([0.5], [0.5]), + ] + ) + self.tokenizer = tokenizer + + def __call__(self, inputs): + examples = [] + max_length = min(max([len(i['caption']) for i in inputs]), 512) + for i in inputs: + example = {} + instance_image = Image.open(i['img_path']) + if not instance_image.mode == "RGB": + instance_image = instance_image.convert("RGB") + example["pixel_values"] = self.image_transforms(instance_image) + example["input_ids"] = self.tokenizer( + i['caption'], + padding="max_length", + truncation=True, + max_length=max_length, + return_tensors='pt', + )['input_ids'][0] + examples.append(example) + return default_collate(examples) + + +class StableDiffusion(LightningModule): + @staticmethod + def add_module_specific_args(parent_parser): + parser = parent_parser.add_argument_group('Taiyi Stable Diffusion Module') + parser.add_argument('--freeze_unet', action='store_true', default=False) + parser.add_argument('--freeze_text_encoder', action='store_true', default=False) + return parent_parser + + def __init__(self, args): + super().__init__() + self.tokenizer = BertTokenizer.from_pretrained( + args.model_path, subfolder="tokenizer") + self.text_encoder = BertModel.from_pretrained( + args.model_path, subfolder="text_encoder") # load from taiyi_finetune-v0 + self.vae = AutoencoderKL.from_pretrained( + args.model_path, subfolder="vae") + self.unet = UNet2DConditionModel.from_pretrained( + args.model_path, subfolder="unet") + # TODO: 使用xformers配合deepspeed速度反而有下降(待确认 + self.unet.set_use_memory_efficient_attention_xformers(False) + + self.noise_scheduler = 
DDPMScheduler( + beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear", num_train_timesteps=1000 + ) + + for param in self.vae.parameters(): + param.requires_grad = False + + if args.freeze_text_encoder: + for param in self.text_encoder.parameters(): + param.requires_grad = False + + if args.freeze_unet: + for param in self.unet.parameters(): + param.requires_grad = False + + self.save_hyperparameters(args) + + def setup(self, stage) -> None: + if stage == 'fit': + self.total_steps = get_total_steps(self.trainer, self.hparams) + print('Total steps: {}' .format(self.total_steps)) + + def configure_optimizers(self): + return configure_optimizers(self) + + def training_step(self, batch, batch_idx): + self.text_encoder.train() + + latents = self.vae.encode(batch["pixel_values"]).latent_dist.sample() + latents = latents * 0.18215 + + # Sample noise that we'll add to the latents + noise = torch.randn(latents.shape).to(latents.device) + noise = noise.to(dtype=self.unet.dtype) + bsz = latents.shape[0] + # Sample a random timestep for each image + timesteps = torch.randint( + 0, self.noise_scheduler.config.num_train_timesteps, (bsz,), device=latents.device) + timesteps = timesteps.long() + # Add noise to the latents according to the noise magnitude at each timestep + # (this is the forward diffusion process) + + noisy_latents = self.noise_scheduler.add_noise(latents, noise, timesteps) + noisy_latents = noisy_latents.to(dtype=self.unet.dtype) + + # Get the text embedding for conditioning + encoder_hidden_states = self.text_encoder(batch["input_ids"])[0] + + # Predict the noise residual + noise_pred = self.unet(noisy_latents, timesteps, encoder_hidden_states).sample + + loss = F.mse_loss(noise_pred, noise, reduction="none").mean([1, 2, 3]).mean() + self.log("train_loss", loss.item(), on_epoch=False, prog_bar=True, logger=True) + + if self.trainer.global_rank == 0 and self.global_step == 100: + # 打印显存占用 + from fengshen.utils.utils import report_memory + report_memory('stable diffusion') + + return {"loss": loss} + + def on_save_checkpoint(self, checkpoint) -> None: + if self.trainer.global_rank == 0: + print('saving model...') + pipeline = StableDiffusionPipeline.from_pretrained( + self.hparams.model_path, + text_encoder=self.text_encoder, + tokenizer=self.tokenizer, + unet=self.unet) + self.trainer.current_epoch + pipeline.save_pretrained(os.path.join( + args.default_root_dir, f'hf_out_{self.trainer.current_epoch}_{self.trainer.global_step}')) + + def on_load_checkpoint(self, checkpoint) -> None: + # 兼容低版本lightning,低版本lightning从ckpt起来时steps数会被重置为0 + global_step_offset = checkpoint["global_step"] + if 'global_samples' in checkpoint: + self.consumed_samples = checkpoint['global_samples'] + self.trainer.fit_loop.epoch_loop._batches_that_stepped = global_step_offset + + +if __name__ == '__main__': + args_parser = argparse.ArgumentParser() + args_parser = add_module_args(args_parser) + args_parser = add_data_args(args_parser) + args_parser = UniversalDataModule.add_data_specific_args(args_parser) + args_parser = Trainer.add_argparse_args(args_parser) + args_parser = StableDiffusion.add_module_specific_args(args_parser) + args_parser = UniversalCheckpoint.add_argparse_args(args_parser) + args = args_parser.parse_args() + + lr_monitor = LearningRateMonitor(logging_interval='step') + checkpoint_callback = UniversalCheckpoint(args) + trainer = Trainer.from_argparse_args(args, + callbacks=[ + lr_monitor, + checkpoint_callback]) + + model = StableDiffusion(args) + tokenizer = model.tokenizer + 
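+    # load_data builds the image/caption sample list from --datasets_path
+    # (txt format: paired .jpg / .txt files, as in demo_dataset/); the Collator
+    # above resizes/crops the images and tokenizes the captions into the
+    # pixel_values / input_ids batches consumed by training_step.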
datasets = load_data(args, global_rank=trainer.global_rank) + collate_fn = Collator(args, tokenizer) + + datamoule = UniversalDataModule( + tokenizer=tokenizer, collate_fn=collate_fn, args=args, datasets=datasets) + + trainer.fit(model, datamoule, ckpt_path=args.load_ckpt_path) diff --git a/fengshen/examples/finetune_taiyi_stable_diffusion/finetune.sh b/fengshen/examples/finetune_taiyi_stable_diffusion/finetune.sh new file mode 100644 index 0000000000000000000000000000000000000000..5e6dab37e15c396776da02e9c549c048dff6f259 --- /dev/null +++ b/fengshen/examples/finetune_taiyi_stable_diffusion/finetune.sh @@ -0,0 +1,88 @@ +#!/bin/bash +#SBATCH --job-name=finetune_taiyi # create a short name for your job +#SBATCH --nodes=1 # node count +#SBATCH --ntasks-per-node=8 # number of tasks to run per node +#SBATCH --cpus-per-task=30 # cpu-cores per task (>1 if multi-threaded tasks) +#SBATCH --gres=gpu:8 # number of gpus per node +#SBATCH -o %x-%j.log # output and error log file names (%x for job id) +#SBATCH -x dgx050 + +# pwd=Fengshenbang-LM/fengshen/examples/pretrain_erlangshen +ROOT_DIR=../../workspace +export TORCH_EXTENSIONS_DIR=${ROOT_DIR}/torch_extendsions + +MODEL_NAME=taiyi-stablediffusion-1B +MODEL_ROOT_DIR=$ROOT_DIR/${MODEL_NAME} +if [ ! -d ${MODEL_ROOT_DIR} ];then + mkdir ${MODEL_ROOT_DIR} +fi + +NNODES=1 +GPUS_PER_NODE=1 + +MICRO_BATCH_SIZE=1 + +# 如果你不用Deepspeed的话 下面的一段话都可以删掉 Begin +CONFIG_JSON="$MODEL_ROOT_DIR/${MODEL_NAME}.ds_config.json" +ZERO_STAGE=1 +# Deepspeed figures out GAS dynamically from dynamic GBS via set_train_batch_size() +cat < $CONFIG_JSON +{ + "zero_optimization": { + "stage": ${ZERO_STAGE} + }, + "bf16": { + "enabled": true + }, + "train_micro_batch_size_per_gpu": $MICRO_BATCH_SIZE +} +EOT +export PL_DEEPSPEED_CONFIG_PATH=$CONFIG_JSON +### End + +DATA_ARGS="\ + --dataloader_workers 2 \ + --train_batchsize $MICRO_BATCH_SIZE \ + --val_batchsize $MICRO_BATCH_SIZE \ + --test_batchsize $MICRO_BATCH_SIZE \ + --datasets_path ./demo_dataset \ + --datasets_type txt \ + --resolution 512 \ + " + +MODEL_ARGS="\ + --model_path IDEA-CCNL/Taiyi-Stable-Diffusion-1B-Chinese-v0.1 \ + --learning_rate 1e-4 \ + --weight_decay 1e-1 \ + --warmup_ratio 0.01 \ + " + +MODEL_CHECKPOINT_ARGS="\ + --save_last \ + --save_ckpt_path ${MODEL_ROOT_DIR}/ckpt \ + --load_ckpt_path ${MODEL_ROOT_DIR}/ckpt/last.ckpt \ + " + +TRAINER_ARGS="\ + --max_epoch 10 \ + --gpus $GPUS_PER_NODE \ + --num_nodes $NNODES \ + --strategy deepspeed_stage_${ZERO_STAGE} \ + --log_every_n_steps 100 \ + --precision bf16 \ + --default_root_dir ${MODEL_ROOT_DIR} \ + --replace_sampler_ddp False \ + --num_sanity_val_steps 0 \ + --limit_val_batches 0 \ + " +# num_sanity_val_steps, limit_val_batches 通过这俩参数把validation关了 + +export options=" \ + $DATA_ARGS \ + $MODEL_ARGS \ + $MODEL_CHECKPOINT_ARGS \ + $TRAINER_ARGS \ + " + +python3 finetune.py $options +#srun -N $NNODES --gres=gpu:$GPUS_PER_NODE --ntasks-per-node=$GPUS_PER_NODE --cpus-per-task=20 python3 pretrain_deberta.py $options diff --git a/fengshen/examples/finetune_taiyi_stable_diffusion/finetune_taiyi_stable_diffusion_example.ipynb b/fengshen/examples/finetune_taiyi_stable_diffusion/finetune_taiyi_stable_diffusion_example.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..94d4593aa3a03f19f007cd18cf747f580d20a6e4 --- /dev/null +++ b/fengshen/examples/finetune_taiyi_stable_diffusion/finetune_taiyi_stable_diffusion_example.ipynb @@ -0,0 +1,601 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "provenance": [], + "machine_shape": 
"hm" + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + }, + "language_info": { + "name": "python" + }, + "accelerator": "GPU", + "gpuClass": "standard" + }, + "cells": [ + { + "cell_type": "markdown", + "source": [ + "# 🖌️ **Finetuning Taiyi-Stable-Diffusion Colab Example**\n", + "\n", + "#####based on https://huggingface.co./IDEA-CCNL/Taiyi-Stable-Diffusion-1B-Chinese-EN-v0.1\n" + ], + "metadata": { + "id": "-GisYq7cG41a" + } + }, + { + "cell_type": "markdown", + "source": [ + "# Installing fengshen framework" + ], + "metadata": { + "id": "twrdGg5zaY0m" + } + }, + { + "cell_type": "code", + "source": [ + "from IPython.display import clear_output\n", + "\n", + "!pip install pytorch_lightning\n", + "!pip install transformers\n", + "!pip install deepspeed\n", + "!pip install diffusers\n", + "!pip install datasets\n", + "!pip install accelerate\n", + "\n", + "!git clone https://github.com/IDEA-CCNL/Fengshenbang-LM\n", + "\n", + "clear_output()\n", + "print(\"Done!\")" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "Y24PHP7dG4gj", + "outputId": "8c444a57-dfc8-4e6e-84f6-f7cbdde03c68" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Done!\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "import os\n", + "\n", + "# 切换工作路径\n", + "os.chdir('/content/Fengshenbang-LM')\n", + "print(os.getcwd())" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "lwZ2CAgkLgda", + "outputId": "d2471d59-c1a5-43d1-fb19-055bf8dfde2c" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "/content/Fengshenbang-LM\n" + ] + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "# Building modules" + ], + "metadata": { + "id": "EMYaGij5acpb" + } + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "CnXybs4VFJnz" + }, + "outputs": [], + "source": [ + "import os\n", + "import torch\n", + "import argparse\n", + "from pytorch_lightning import (\n", + " LightningModule,\n", + " Trainer,\n", + ")\n", + "from pytorch_lightning.callbacks import (\n", + " LearningRateMonitor,\n", + ")\n", + "from fengshen.data.universal_datamodule import UniversalDataModule\n", + "from fengshen.models.model_utils import (\n", + " add_module_args,\n", + " configure_optimizers,\n", + " get_total_steps,\n", + ")\n", + "from fengshen.utils.universal_checkpoint import UniversalCheckpoint\n", + "from transformers import BertTokenizer, BertModel\n", + "from diffusers import AutoencoderKL, DDPMScheduler, StableDiffusionPipeline, UNet2DConditionModel\n", + "from torch.nn import functional as F\n", + "from fengshen.data.taiyi_stable_diffusion_datasets.taiyi_datasets import add_data_args, load_data\n", + "from torchvision import transforms\n", + "from PIL import Image\n", + "from torch.utils.data._utils.collate import default_collate\n", + "\n", + "\n", + "class Collator():\n", + " def __init__(self, args, tokenizer):\n", + " self.image_transforms = transforms.Compose(\n", + " [\n", + " transforms.Resize(\n", + " args.resolution, interpolation=transforms.InterpolationMode.BILINEAR),\n", + " transforms.CenterCrop(\n", + " args.resolution) if args.center_crop else transforms.RandomCrop(args.resolution),\n", + " transforms.ToTensor(),\n", + " transforms.Normalize([0.5], [0.5]),\n", + " ]\n", + " )\n", + " self.tokenizer = tokenizer\n", + "\n", + " def __call__(self, inputs):\n", + " examples = 
[]\n", + " max_length = min(max([len(i['caption']) for i in inputs]), 512)\n", + " for i in inputs:\n", + " example = {}\n", + " instance_image = Image.open(i['img_path'])\n", + " if not instance_image.mode == \"RGB\":\n", + " instance_image = instance_image.convert(\"RGB\")\n", + " example[\"pixel_values\"] = self.image_transforms(instance_image)\n", + " example[\"input_ids\"] = self.tokenizer(\n", + " i['caption'],\n", + " padding=\"max_length\",\n", + " truncation=True,\n", + " max_length=max_length,\n", + " return_tensors='pt',\n", + " )['input_ids'][0]\n", + " examples.append(example)\n", + " return default_collate(examples)\n", + "\n", + "class StableDiffusion(LightningModule):\n", + " @staticmethod\n", + " def add_module_specific_args(parent_parser):\n", + " parser = parent_parser.add_argument_group('Taiyi Stable Diffusion Module')\n", + " parser.add_argument('--freeze_unet', action='store_true', default=False)\n", + " parser.add_argument('--freeze_text_encoder', action='store_true', default=False)\n", + " return parent_parser\n", + "\n", + " def __init__(self, args):\n", + " super().__init__()\n", + " self.tokenizer = BertTokenizer.from_pretrained(\n", + " args.model_path, subfolder=\"tokenizer\")\n", + " self.text_encoder = BertModel.from_pretrained(\n", + " args.model_path, subfolder=\"text_encoder\") # load from taiyi_finetune-v0\n", + " self.vae = AutoencoderKL.from_pretrained(\n", + " args.model_path, subfolder=\"vae\")\n", + " self.unet = UNet2DConditionModel.from_pretrained(\n", + " args.model_path, subfolder=\"unet\")\n", + " # TODO: 使用xformers配合deepspeed速度反而有下降(待确认\n", + " self.unet.set_use_memory_efficient_attention_xformers(False)\n", + "\n", + " self.noise_scheduler = DDPMScheduler(\n", + " beta_start=0.00085, beta_end=0.012, beta_schedule=\"scaled_linear\", num_train_timesteps=1000\n", + " )\n", + "\n", + " for param in self.vae.parameters():\n", + " param.requires_grad = False\n", + "\n", + " if args.freeze_text_encoder:\n", + " for param in self.text_encoder.parameters():\n", + " param.requires_grad = False\n", + "\n", + " if args.freeze_unet:\n", + " for param in self.unet.parameters():\n", + " param.requires_grad = False\n", + "\n", + " self.save_hyperparameters(args)\n", + "\n", + " def setup(self, stage) -> None:\n", + " if stage == 'fit':\n", + " self.total_steps = get_total_steps(self.trainer, self.hparams)\n", + " print('Total steps: {}' .format(self.total_steps))\n", + "\n", + " def configure_optimizers(self):\n", + " return configure_optimizers(self)\n", + "\n", + " def training_step(self, batch, batch_idx):\n", + " self.text_encoder.train()\n", + "\n", + " latents = self.vae.encode(batch[\"pixel_values\"]).latent_dist.sample()\n", + " latents = latents * 0.18215\n", + "\n", + " # Sample noise that we'll add to the latents\n", + " noise = torch.randn(latents.shape).to(latents.device)\n", + " noise = noise.to(dtype=self.unet.dtype)\n", + " bsz = latents.shape[0]\n", + " # Sample a random timestep for each image\n", + " timesteps = torch.randint(\n", + " 0, self.noise_scheduler.config.num_train_timesteps, (bsz,), device=latents.device)\n", + " timesteps = timesteps.long()\n", + " # Add noise to the latents according to the noise magnitude at each timestep\n", + " # (this is the forward diffusion process)\n", + "\n", + " noisy_latents = self.noise_scheduler.add_noise(latents, noise, timesteps)\n", + " noisy_latents = noisy_latents.to(dtype=self.unet.dtype)\n", + "\n", + " # Get the text embedding for conditioning\n", + " encoder_hidden_states = 
self.text_encoder(batch[\"input_ids\"])[0]\n", + "\n", + " # Predict the noise residual\n", + " noise_pred = self.unet(noisy_latents, timesteps, encoder_hidden_states).sample\n", + "\n", + " loss = F.mse_loss(noise_pred, noise, reduction=\"none\").mean([1, 2, 3]).mean()\n", + " self.log(\"train_loss\", loss.item(), on_epoch=False, prog_bar=True, logger=True)\n", + "\n", + " if self.trainer.global_rank == 0 and self.global_step == 100:\n", + " # 打印显存占用\n", + " from fengshen.utils.utils import report_memory\n", + " report_memory('stable diffusion')\n", + "\n", + " return {\"loss\": loss}\n", + "\n", + " def on_save_checkpoint(self, checkpoint) -> None:\n", + " if self.trainer.global_rank == 0:\n", + " print('saving model...')\n", + " pipeline = StableDiffusionPipeline.from_pretrained(\n", + " self.hparams.model_path,\n", + " text_encoder=self.text_encoder,\n", + " tokenizer=self.tokenizer,\n", + " unet=self.unet)\n", + " self.trainer.current_epoch\n", + " pipeline.save_pretrained(os.path.join(\n", + " args.default_root_dir, f'hf_out_{self.trainer.current_epoch}_{self.trainer.global_step}'))\n", + "\n", + " def on_load_checkpoint(self, checkpoint) -> None:\n", + " # 兼容低版本lightning,低版本lightning从ckpt起来时steps数会被重置为0\n", + " global_step_offset = checkpoint[\"global_step\"]\n", + " if 'global_samples' in checkpoint:\n", + " self.consumed_samples = checkpoint['global_samples']\n", + " self.trainer.fit_loop.epoch_loop._batches_that_stepped = global_step_offset" + ] + }, + { + "cell_type": "markdown", + "source": [ + "# Settings" + ], + "metadata": { + "id": "jN-ATKxi1TUa" + } + }, + { + "cell_type": "code", + "source": [ + "from pprint import pprint\n", + "\n", + "args_parser = argparse.ArgumentParser()\n", + "args_parser = add_module_args(args_parser)\n", + "args_parser = add_data_args(args_parser)\n", + "args_parser = UniversalDataModule.add_data_specific_args(args_parser)\n", + "args_parser = Trainer.add_argparse_args(args_parser)\n", + "args_parser = StableDiffusion.add_module_specific_args(args_parser)\n", + "args_parser = UniversalCheckpoint.add_argparse_args(args_parser)\n", + "\n", + "# 你的数据集,可以参考 https://github.com/IDEA-CCNL/Fengshenbang-LM/tree/main/fengshen/examples/finetune_taiyi_stable_diffusion 的demo_dataset的设置\n", + "your_dataset_path = '/content/Fengshenbang-LM/fengshen/examples/finetune_taiyi_stable_diffusion/demo_dataset' #@param {type:\"string\"}\n", + "# 默认为下载huggingface上的模型\n", + "your_model_path = 'IDEA-CCNL/Taiyi-Stable-Diffusion-1B-Chinese-v0.1' #@param {type:\"string\"}\n", + "train_batch_size = '1' #@param {type:\"string\"}\n", + "\n", + "message = [\n", + " '--datasets_path', your_dataset_path,\n", + " '--datasets_type', 'txt',\n", + " '--model_path', your_model_path,\n", + " '--train_batchsize', train_batch_size,\n", + " '--accelerator', 'gpu',\n", + " # '--strategy', 'deepspeed',\n", + " '--precision', '16',\n", + "]\n", + "\n", + "args = args_parser.parse_args(args=message)\n", + "\n", + "pprint(vars(args), width = 230)" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "KFnQFiQ_1S8w", + "outputId": "0802adc1-2b62-4557-96aa-796b5a2ca535" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "{'accelerator': 'gpu',\n", + " 'accumulate_grad_batches': None,\n", + " 'adam_beta1': 0.9,\n", + " 'adam_beta2': 0.999,\n", + " 'adam_epsilon': 1e-08,\n", + " 'amp_backend': None,\n", + " 'amp_level': None,\n", + " 'auto_lr_find': False,\n", + " 'auto_scale_batch_size': False,\n", + " 
'auto_select_gpus': None,\n", + " 'benchmark': None,\n", + " 'center_crop': False,\n", + " 'check_val_every_n_epoch': 1,\n", + " 'dataloader_workers': 2,\n", + " 'datasets_name': None,\n", + " 'datasets_path': ['/content/Fengshenbang-LM/fengshen/examples/finetune_taiyi_stable_diffusion/demo_dataset'],\n", + " 'datasets_type': ['txt'],\n", + " 'default_root_dir': None,\n", + " 'detect_anomaly': False,\n", + " 'devices': None,\n", + " 'enable_checkpointing': True,\n", + " 'enable_model_summary': True,\n", + " 'enable_progress_bar': True,\n", + " 'every_n_epochs': None,\n", + " 'every_n_train_steps': None,\n", + " 'fast_dev_run': False,\n", + " 'filename': 'model-ep{epoch:02d}-st{step:d}',\n", + " 'freeze_text_encoder': False,\n", + " 'freeze_unet': False,\n", + " 'gpus': None,\n", + " 'gradient_clip_algorithm': None,\n", + " 'gradient_clip_val': None,\n", + " 'inference_mode': True,\n", + " 'ipus': None,\n", + " 'learning_rate': 5e-05,\n", + " 'limit_predict_batches': None,\n", + " 'limit_test_batches': None,\n", + " 'limit_train_batches': None,\n", + " 'limit_val_batches': None,\n", + " 'load_ckpt_path': './ckpt/',\n", + " 'log_every_n_steps': 50,\n", + " 'logger': True,\n", + " 'lr_decay_ratio': 1.0,\n", + " 'lr_decay_steps': 0,\n", + " 'max_epochs': None,\n", + " 'max_steps': -1,\n", + " 'max_time': None,\n", + " 'min_epochs': None,\n", + " 'min_learning_rate': 1e-07,\n", + " 'min_steps': None,\n", + " 'mode': 'max',\n", + " 'model_path': 'IDEA-CCNL/Taiyi-Stable-Diffusion-1B-Chinese-v0.1',\n", + " 'monitor': 'step',\n", + " 'move_metrics_to_cpu': False,\n", + " 'multiple_trainloader_mode': 'max_size_cycle',\n", + " 'num_nodes': 1,\n", + " 'num_processes': None,\n", + " 'num_sanity_val_steps': 2,\n", + " 'num_workers': 8,\n", + " 'overfit_batches': 0.0,\n", + " 'plugins': None,\n", + " 'precision': 16,\n", + " 'profiler': None,\n", + " 'raw_file_type': 'json',\n", + " 'reload_dataloaders_every_n_epochs': 0,\n", + " 'replace_sampler_ddp': True,\n", + " 'resolution': 512,\n", + " 'resume_from_checkpoint': None,\n", + " 'sampler_type': 'random',\n", + " 'save_ckpt_path': './ckpt/',\n", + " 'save_last': False,\n", + " 'save_on_train_epoch_end': None,\n", + " 'save_top_k': 10,\n", + " 'save_weights_only': False,\n", + " 'scheduler_type': 'polynomial',\n", + " 'strategy': None,\n", + " 'sync_batchnorm': False,\n", + " 'test_batchsize': 16,\n", + " 'test_datasets_field': 'test',\n", + " 'test_file': None,\n", + " 'thres': 0.2,\n", + " 'tpu_cores': None,\n", + " 'track_grad_norm': -1,\n", + " 'train_batchsize': 1,\n", + " 'train_datasets_field': 'train',\n", + " 'train_file': None,\n", + " 'val_batchsize': 16,\n", + " 'val_check_interval': None,\n", + " 'val_datasets_field': 'validation',\n", + " 'val_file': None,\n", + " 'warmup_ratio': 0.1,\n", + " 'warmup_steps': 0,\n", + " 'weight_decay': 0.1}\n" + ] + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "# Start training" + ], + "metadata": { + "id": "sgSAEhHoagek" + } + }, + { + "cell_type": "code", + "source": [ + "!nvidia-smi\n", + "!cat /proc/meminfo" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "yALlfBnj4AUF", + "outputId": "528bd7a5-1c9a-48e5-d92a-471ac9774dde" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Mon Feb 13 05:26:48 2023 \n", + "+-----------------------------------------------------------------------------+\n", + "| NVIDIA-SMI 510.47.03 Driver Version: 510.47.03 CUDA Version: 11.6 |\n", + 
"|-------------------------------+----------------------+----------------------+\n", + "| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |\n", + "| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |\n", + "| | | MIG M. |\n", + "|===============================+======================+======================|\n", + "| 0 Tesla T4 Off | 00000000:00:04.0 Off | 0 |\n", + "| N/A 71C P0 32W / 70W | 3MiB / 15360MiB | 0% Default |\n", + "| | | N/A |\n", + "+-------------------------------+----------------------+----------------------+\n", + " \n", + "+-----------------------------------------------------------------------------+\n", + "| Processes: |\n", + "| GPU GI CI PID Type Process name GPU Memory |\n", + "| ID ID Usage |\n", + "|=============================================================================|\n", + "| No running processes found |\n", + "+-----------------------------------------------------------------------------+\n", + "MemTotal: 26690612 kB\n", + "MemFree: 22642924 kB\n", + "MemAvailable: 24857828 kB\n", + "Buffers: 48944 kB\n", + "Cached: 2435656 kB\n", + "SwapCached: 0 kB\n", + "Active: 331644 kB\n", + "Inactive: 3390024 kB\n", + "Active(anon): 1568 kB\n", + "Inactive(anon): 1233664 kB\n", + "Active(file): 330076 kB\n", + "Inactive(file): 2156360 kB\n", + "Unevictable: 0 kB\n", + "Mlocked: 0 kB\n", + "SwapTotal: 0 kB\n", + "SwapFree: 0 kB\n", + "Dirty: 396 kB\n", + "Writeback: 0 kB\n", + "AnonPages: 1237216 kB\n", + "Mapped: 537236 kB\n", + "Shmem: 1304 kB\n", + "KReclaimable: 103288 kB\n", + "Slab: 147424 kB\n", + "SReclaimable: 103288 kB\n", + "SUnreclaim: 44136 kB\n", + "KernelStack: 5216 kB\n", + "PageTables: 21332 kB\n", + "NFS_Unstable: 0 kB\n", + "Bounce: 0 kB\n", + "WritebackTmp: 0 kB\n", + "CommitLimit: 13345304 kB\n", + "Committed_AS: 3402512 kB\n", + "VmallocTotal: 34359738367 kB\n", + "VmallocUsed: 57876 kB\n", + "VmallocChunk: 0 kB\n", + "Percpu: 2672 kB\n", + "HardwareCorrupted: 0 kB\n", + "AnonHugePages: 0 kB\n", + "ShmemHugePages: 0 kB\n", + "ShmemPmdMapped: 0 kB\n", + "FileHugePages: 0 kB\n", + "FilePmdMapped: 0 kB\n", + "CmaTotal: 0 kB\n", + "CmaFree: 0 kB\n", + "HugePages_Total: 0\n", + "HugePages_Free: 0\n", + "HugePages_Rsvd: 0\n", + "HugePages_Surp: 0\n", + "Hugepagesize: 2048 kB\n", + "Hugetlb: 0 kB\n", + "DirectMap4k: 484160 kB\n", + "DirectMap2M: 15241216 kB\n", + "DirectMap1G: 13631488 kB\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "import pytorch_lightning as pl\n", + "print(pl.__version__)" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "s1vid40pLVDF", + "outputId": "25a92804-0fbd-4004-83fd-80e763286555" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "1.9.1\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "lr_monitor = LearningRateMonitor(logging_interval='step')\n", + "checkpoint_callback = UniversalCheckpoint(args)\n", + "\n", + "trainer = Trainer.from_argparse_args(args,\n", + " callbacks=[\n", + " lr_monitor,\n", + " checkpoint_callback])\n", + "\n", + "model = StableDiffusion(args)\n", + "tokenizer = model.tokenizer\n", + "\n", + "datasets = load_data(args, global_rank=trainer.global_rank)\n", + "collate_fn = Collator(args, tokenizer)\n", + "\n", + "datamoule = UniversalDataModule(\n", + " tokenizer=tokenizer, collate_fn=collate_fn, args=args, datasets=datasets)\n", + "\n", + "trainer.fit(model, datamoule)" + ], + "metadata": { + "id": "b4nSmmNrLVwG" + }, + "execution_count": 
null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "It might be OOM, which is caused by low GPU memory in Colab.\n", + "\n", + "This notebook proves that our codes can run in our settings." + ], + "metadata": { + "id": "9DnOM7qNbokd" + } + } + ] +} diff --git a/fengshen/examples/finetune_taiyi_stable_diffusion/readme.md b/fengshen/examples/finetune_taiyi_stable_diffusion/readme.md new file mode 100644 index 0000000000000000000000000000000000000000..62c5b8b488ed6a45c0eab17cf59ceda0fc335194 --- /dev/null +++ b/fengshen/examples/finetune_taiyi_stable_diffusion/readme.md @@ -0,0 +1,46 @@ +# Taiyi-Stable-Diffusion Finetune示例 + +本示例可以应用于**IDEA-CCNL/Taiyi-Stable-Diffusion-1B-Chinese-v0.1**在自建的数据集上进行进一步训练,同时稍微修改代码也能够兼容大部分Stable-Diffusion结构。本示例仅提供参考,有任何疑问或者有需要协助的都可以提Issue到本项目中,会有专门的同学解答~ + +注:已更新了[colab的example](https://github.com/IDEA-CCNL/Fengshenbang-LM/blob/main/fengshen/examples/finetune_taiyi_stable_diffusion/finetune_taiyi_stable_diffusion_example.ipynb) + +## 数据处理 + +在./demo_dataset下有我们一个数据集的样例,其中一个sample由.jpg格式图片以及.txt文本文件组成,用户可以按照我们的格式处理然后直接将脚本内的datasets_path修改为自己的路径即可。(数据摘自[IDEA-CCNL/laion2B-multi-chinese-subset](https://huggingface.co./datasets/IDEA-CCNL/laion2B-multi-chinese-subset)) + +## 配置要求 + +Finetune **IDEA-CCNL/Taiyi-Stable-Diffusion-1B-Chinese-v0.1** 十亿级别参数,我们自己测试所需要的配置基础如下。batch_size设定为1 + +fp32: + +- 显存:26G以上 +- 内存:64G以上 + +fp16: + +- 显存:21G以上 +- 内存:64G以上 + +fp16 + deepspeed offload + +- 显存:6G以上 +- 内存:80G以上 + +## 运行脚本 + +处理好自己的数据集后,只需要将脚本中的datasets_path指向你的数据集,不需要修改其他参数就能运行。在脚本中也提供了丰富的超参供大家修改,例如batch_size, ckpt_path等等都可以根据自己的需求做更改,其中model_path指向的是huggingface上的模型路径,下载可能比较慢,如果用户已经在本地下载过一份权重,直接将model_path改成本地路径即可。 + +一些常用的参数我们会放在[封神榜的文档里](https://fengshenbang-doc.readthedocs.io/zh/latest/docs/%E5%B0%81%E7%A5%9E%E6%A1%86%E6%9E%B6/%E5%8F%82%E6%95%B0%E7%AE%A1%E7%90%86.html) + +有任何不清楚的地方,不要吝啬你的Issue,直接提过来。 + +## 一些训练中的Trick + +### Deepspeed + +在示例中我们默认开始了Deepspeed,通过Deepspeed我们能提高不少训练效率(即使是单卡)。并且得益于Zero Redundancy Optimizer的技术,在多卡的环境我们能显著的减少显存占用,提高batch_size以获得更高的效率,强烈建议有条件的同学开启Deepspeed。 + +### 8BitAdam + +TODO: 优化显存以及提高训练效率 diff --git a/fengshen/examples/hubert/pretrain_hubert.py b/fengshen/examples/hubert/pretrain_hubert.py new file mode 100644 index 0000000000000000000000000000000000000000..6506364b9498c5b994c085e1a5342082283ef62b --- /dev/null +++ b/fengshen/examples/hubert/pretrain_hubert.py @@ -0,0 +1,287 @@ +import fengshen.data.hubert.hubert_dataset as datasets +from fengshen.data.universal_datamodule import UniversalDataModule +from transformers import HubertConfig, HubertModel +# from transformers.models.hubert.modeling_hubert import _compute_mask_indices +import argparse +from fairseq.data import Dictionary +from pytorch_lightning import ( + LightningModule, + Trainer, + loggers, +) +from pytorch_lightning.callbacks import LearningRateMonitor +import torch +import os +import torch.nn.functional as F +import torch.nn as nn + + +class LabelEncoder(object): + def __init__(self, dictionary: Dictionary): + self.dictionary = dictionary + + def __call__(self, label: str): + return self.dictionary.encode_line( + label, + append_eos=False, + add_if_not_exist=False, + ) + + +class HubertPretrainDataLoader(): + def __init__(self, args): + self.cfg = args + self.dictionaries = self.load_dictionaries() + self.load_datasets = {} + + # TODO 改成HuggingFace Tokenizer + def load_dictionaries(self): + label_dir = self.cfg.data if self.cfg.label_dir is None else self.cfg.label_dir + dictionaries = [ + Dictionary.load(f"{label_dir}/dict.{label}.txt") + for label in 
self.cfg.labels + ] + return dictionaries + + def get_label_dir(self): + if self.cfg.label_dir is None: + return self.cfg.data + return self.cfg.label_dir + + @property + def datasets(self): + return self.load_datasets + + def load_dataset(self, split: str, **kwargs): + manifest = f"{self.cfg.data}/{split}.tsv" + dicts = self.dictionaries + pad_list = [dict.pad() for dict in dicts] + eos_list = [dict.eos() for dict in dicts] + procs = [LabelEncoder(dict) for dict in dicts] + paths = [f"{self.get_label_dir()}/{split}.{lb}" for lb in self.cfg.labels] + + # hubert v1: pad_audio=True, random_crop=False; + self.load_datasets[split] = datasets.HubertDataset( + manifest, + sample_rate=self.cfg.sample_rate, + label_paths=paths, + label_rates=self.cfg.label_rate, + pad_list=pad_list, + eos_list=eos_list, + label_processors=procs, + max_keep_sample_size=self.cfg.max_keep_size, + min_keep_sample_size=self.cfg.min_sample_size, + max_sample_size=self.cfg.max_sample_size, + pad_audio=self.cfg.pad_audio, + normalize=self.cfg.normalize, + store_labels=False, + random_crop=self.cfg.random_crop, + single_target=self.cfg.single_target, + ) + + +def perpare_data(args): + loader = HubertPretrainDataLoader(args) + loader.load_dataset('train') + loader.load_dataset('valid') + return loader + + +class HubertLightning(LightningModule): + @staticmethod + def add_module_specific_args(parent_parser): + parser = parent_parser.add_argument_group('HuBert Lightning') + parser.add_argument('--pred_masked_weight', type=float, default=1.0) + parser.add_argument('--logit_temp', type=float, default=1.0) + parser.add_argument('--loss_weights', type=float, nargs='+') + # parser.add_argument('--mask_prob', type=float, default=0.65) + # parser.add_argument('--mask_length', type=int, default=10) + # parser.add_argument('--mask_selection', type=str, default='static', + # choice=["static", "uniform", "normal", "poisson"]) + # parser.add_argument('--mask_other', type=float, default=0) + # parser.add_argument('--no_mask_overlap', type=bool, default=False) + # parser.add_argument('--mask_min_space', type=int, default=1) + return parent_parser + + def __init__(self, args, loader, ** kwargs) -> None: + super().__init__() + self.save_hyperparameters(args) + config = HubertConfig.from_pretrained(args.model_path) + self.config = config + self.model = HubertModel(config=config) + self.num_classes = [len(d) for d in loader.dictionaries] + self.label_embs_concat = nn.Parameter( + torch.FloatTensor(sum(self.num_classes), self.config.conv_dim[-1] // 2) + ) + self.final_proj = nn.Linear( + self.config.hidden_size, self.config.conv_dim[-1] // 2 * len(loader.dictionaries) + ) + nn.init.uniform_(self.label_embs_concat) + + def setup(self, stage) -> None: + if stage == 'fit': + train_loader = self.trainer._data_connector._train_dataloader_source.dataloader() + + # Calculate total steps + if self.trainer.max_epochs > 0: + world_size = self.trainer.world_size + tb_size = self.hparams.train_batchsize * max(1, world_size) + ab_size = self.trainer.accumulate_grad_batches + self.total_steps = (len(train_loader.dataset) * + self.trainer.max_epochs // tb_size) // ab_size + else: + self.total_steps = self.trainer.max_steps // self.trainer.accumulate_grad_batches + + print('Total steps: {}' .format(self.total_steps)) + + def configure_optimizers(self): + from fengshen.models.model_utils import configure_optimizers + return configure_optimizers(self) + + def compute_nce(self, x, pos, negs): + neg_is_pos = (pos == negs).all(-1) + pos = pos.unsqueeze(0) + targets 
= torch.cat([pos, negs], dim=0) + + logits = torch.cosine_similarity(x.float(), targets.float(), dim=-1).type_as(x) + logits /= self.hparams.logit_temp + if neg_is_pos.any(): + logits[1:][neg_is_pos] = float("-inf") + logits = logits.transpose(0, 1) # (num_x, num_cls+1) + return logits + + def forward(self, **batch): + + target_list = batch['target_list'] + padding_mask = batch['net_input']['padding_mask'] + input_values = batch['net_input']['source'] + output = self.model(input_values=input_values, + attention_mask=padding_mask, + target_list=target_list, + mask_time_indices=None, + return_dict=False) + + def compute_pred(proj_x, target, label_embs): + # compute logits for the i-th label set + y = torch.index_select(label_embs, 0, target.long()) + negs = label_embs.unsqueeze(1).expand(-1, proj_x.size(0), -1) + # proj_x: (S, D) + # y: (S, D) + # negs: (Neg, S, D) + return self.compute_nce(proj_x, y, negs) + + label_embs_list = self.label_embs_concat.split(self.num_classes, 0) + + x, extra_losses, target_list, mask_indices, padding_mask = output[ + 0], output[-4], output[-3], output[-2], output[-1] + + masked_indices = torch.logical_and(~padding_mask, mask_indices) + proj_x_m = self.final_proj(x[masked_indices]) + proj_x_m_list = proj_x_m.chunk(len(target_list), dim=-1) + logp_m_list = [ + compute_pred(proj_x_m, t[masked_indices], label_embs_list[i]) + for i, (proj_x_m, t) in enumerate(zip(proj_x_m_list, target_list)) + ] + + targ_m_list = [x.new_zeros(x.size(0), dtype=torch.long) for x in logp_m_list] + + loss = 0.0 + loss_m_list = [] + + for i, (logp_m, targ_m) in enumerate(zip(logp_m_list, targ_m_list)): + loss_m = F.cross_entropy(logp_m, targ_m) + loss_m_list.append(loss_m) + self.log(f"loss_m_{i}", loss_m.detach().item()) + + loss += self.hparams.pred_masked_weight * sum(loss_m_list) + + loss_weights = self.hparams.loss_weights + if loss_weights is not None: + if torch.is_tensor(extra_losses): + extra_losses = [extra_losses] + names = ['extra'] + if len(loss_weights) == 1 and len(extra_losses) != 1: + loss_weights = [loss_weights[0]] * len(extra_losses) + assert len(extra_losses) == len( + loss_weights + ), f"{len(extra_losses)}, {len(loss_weights)}" + for p, n, coef in zip(extra_losses, names, loss_weights): + if coef != 0 and p is not None: + p = coef * p.float() + loss += p + self.log(f"loss_{n}", p.item()) + + return {'loss': loss} + + def training_step(self, batch, batch_idx): + output = self(**batch) + self.log('train_loss', output['loss']) + return output + + def comput_metrix(self, logits, labels): + y_pred = torch.argmax(logits, dim=-1) + y_pred = y_pred.view(size=(-1,)) + y_true = labels.view(size=(-1,)).float() + corr = torch.eq(y_pred, y_true) + acc = torch.sum(corr.float()) / y_true.size()[0] + return acc + + def validation_step(self, batch, batch_idx): + output = self(**batch) + # self.log('val_loss', output.loss, sync_dist=True) + # acc = self.comput_metrix(output.logits, batch['labels']) + # self.log('val_acc', acc, sync_dist=True) + return output + + def on_save_checkpoint(self, checkpoint) -> None: + # Save the current loop info in the mid of epoch + # if you lightning <= 1.6.0 uncomment the line below + # checkpoint['loops'] = self.trainer.checkpoint_connector._get_loops_state_dict() + if self.trainer.global_rank == 0: + self.model.save_pretrained(os.path.join( + self.trainer.checkpoint_callback.dirpath, + 'hf_pretrained_epoch{}_step{}'.format(self.trainer.current_epoch, self.trainer.global_step))) + + def on_load_checkpoint(self, checkpoint) -> None: + 
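        # Older pytorch-lightning versions reset the step counter when resuming from a
        # ckpt, so restore the saved global step (and consumed sample count) by hand.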
global_step_offset = checkpoint["global_step"] + if 'global_samples' in checkpoint: + self.consumed_samples = checkpoint['global_samples'] + self.trainer.fit_loop.epoch_loop._batches_that_stepped = global_step_offset + + +if __name__ == '__main__': + args_parser = argparse.ArgumentParser() + from fengshen.utils import UniversalCheckpoint + from fengshen.models.model_utils import add_module_args + args_parser = add_module_args(args_parser) + args_parser = datasets.add_data_specific_args(args_parser) + args_parser = UniversalDataModule.add_data_specific_args(args_parser) + args_parser = Trainer.add_argparse_args(args_parser) + args_parser = HubertLightning.add_module_specific_args(args_parser) + args_parser = UniversalCheckpoint.add_argparse_args(args_parser) + args_parser.add_argument('--ckpt_path', type=str, ) + args = args_parser.parse_args() + + data_module = UniversalDataModule(args=args, tokenizer=None, collate_fn=None) + data_loader = perpare_data(args) + data_module.datasets = data_loader.datasets + module = HubertLightning(args, loader=data_loader) + + lr_monitor = LearningRateMonitor(logging_interval='step') + logger = loggers.TensorBoardLogger(save_dir=os.path.join( + args.default_root_dir, 'logs/'), + name=os.path.basename(os.path.dirname(args.model_path))) + checkpoint_callback = UniversalCheckpoint(args).callbacks + + if args.ckpt_path is not None and \ + not os.path.exists(args.ckpt_path): + print('--------warning no checkpoint found--------, remove args') + args.ckpt_path = None + + trainer = Trainer.from_argparse_args(args, + logger=logger, + callbacks=[ + lr_monitor, + checkpoint_callback]) + + trainer.fit(module, data_module, ckpt_path=args.ckpt_path) diff --git a/fengshen/examples/hubert/pretrain_hubert_base.sh b/fengshen/examples/hubert/pretrain_hubert_base.sh new file mode 100644 index 0000000000000000000000000000000000000000..11e5ddf38361d51910c35b02f10b7e285ab3f0fb --- /dev/null +++ b/fengshen/examples/hubert/pretrain_hubert_base.sh @@ -0,0 +1,120 @@ +#!/bin/bash +#SBATCH --job-name=pretrain_bart # create a short name for your job +#SBATCH --nodes=1 # node count +#SBATCH --ntasks-per-node=8 # number of tasks to run per node +#SBATCH --cpus-per-task=30 # cpu-cores per task (>1 if multi-threaded tasks) +#SBATCH --gres=gpu:8 # number of gpus per node +#SBATCH -o %x-%j.log # output and error log file names (%x for job id) +#SBATCH -x dgx050 + +MODEL_NAME=hubert-base-ls960 +config_json="./$MODEL_NAME.ds_config.json" +export MASTER_PORT=29503 +MICRO_BATCH_SIZE=8 +ZERO_STAGE=1 + +# Deepspeed figures out GAS dynamically from dynamic GBS via set_train_batch_size() +cat < $config_json +{ + "zero_optimization": { + "stage": ${ZERO_STAGE} + }, + "fp16": { + "enabled": true, + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 16, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "tensorboard": { + "enabled": true, + "output_path": "/data/training_model/fengshen-${MODEL_NAME}/ds-tb-logs", + "job_name": "${MODEL_NAME}" + }, + "#flops_profiler": { + "enabled": true, + "profile_step": 200, + "detailed": true, + "output_file": null + }, + "steps_per_print": 100, + "gradient_clipping": 1, + "train_micro_batch_size_per_gpu": $MICRO_BATCH_SIZE, + "zero_allow_untested_optimizer": false +} +EOT + +export PL_DEEPSPEED_CONFIG_PATH=$config_json +export TORCH_EXTENSIONS_DIR=/home/gaoxinyu/torch_extendsions + +DATA_DIR=/data/common_data/librispeech_tsv/datas +LABELS_DIR=/data/common_data/librispeech_tsv/labels + +DATA_ARGS="\ + --dataloader_workers 2 \ + --train_batchsize 
$MICRO_BATCH_SIZE \ + --val_batchsize 32 \ + --test_batchsize 8 \ + --val_datasets_field valid \ + --test_datasets_field valid \ + --sampler_type random \ + --data ${DATA_DIR} \ + --label_dir ${LABELS_DIR} \ + --labels km \ + --label_rate 100 \ + --max_sample_size 250000 \ + --min_sample_size 32000 \ + --pad_audio False \ + --random_crop True \ + --normalize False \ + " + +MODEL_ARGS="\ + --model_path /data/pretrained_model/$MODEL_NAME/ \ + --learning_rate 1e-4 \ + --weight_decay 1e-2 \ + --warmup_ratio 0.01 \ + --pred_masked_weight 1.0 \ + --loss_weights 10 \ + " + +MODEL_CHECKPOINT_ARGS="\ + --monitor train_loss \ + --save_top_k 0 \ + --mode min \ + --every_n_train_steps 10000 \ + --dirpath /data/training_model/ckpt/fengshen-$MODEL_NAME \ + --filename model-{step:02d}-{train_loss:.4f} \ + --every_n_epochs 0 \ + --save_last \ + --not_save_on_train_epoch_end \ + " + +# deepspeed_stage_${ZERO_STAGE} \ +TRAINER_ARGS="\ + --gradient_clip_val 1.0 \ + --max_epochs 10 \ + --gpus 2 \ + --num_nodes 1 \ + --strategy deepspeed_stage_${ZERO_STAGE} \ + --log_every_n_steps 100 \ + --val_check_interval 500 \ + --limit_val_batches 10 \ + --accumulate_grad_batches 1 \ + --precision 16 \ + --ckpt_path /data/training_model/ckpt/fengshen-${MODEL_NAME}/last.ckpt \ + --default_root_dir /data/training_model/fengshen-$MODEL_NAME \ + " + + +export options=" \ + $DATA_ARGS \ + $MODEL_ARGS \ + $MODEL_CHECKPOINT_ARGS \ + $TRAINER_ARGS \ + " + +export SCRIPT_PATH=pretrain_hubert.py + +eval python3 -m debugpy --listen localhost:53005 --wait-for-client $SCRIPT_PATH $options diff --git a/fengshen/examples/longformer/README.md b/fengshen/examples/longformer/README.md new file mode 100644 index 0000000000000000000000000000000000000000..ef4706898b87d2f10eff5df2db24ae3a182ce673 --- /dev/null +++ b/fengshen/examples/longformer/README.md @@ -0,0 +1,34 @@ +# longformer model (Chinese),one model of [Fengshenbang-LM](https://github.com/IDEA-CCNL/Fengshenbang-LM). +We modify the original position code of longformer to rotational position coding,and on the basis of [chinese_roformer_L-12_H-768_A-12.zip](https://github.com/ZhuiyiTechnology/roformer), use 180G of data to continue training + +## Usage +There is no structure of Longformer-base in [Transformers](https://github.com/huggingface/transformers), you can run follow code to get structure of longformer from [Fengshenbang-LM](https://github.com/IDEA-CCNL/Fengshenbang-LM) + + ```shell + git clone https://github.com/IDEA-CCNL/Fengshenbang-LM.git + ``` + +### Load Model +```python +from fengshen import LongformerModel +from fengshen import LongformerConfig +from transformers import BertTokenizer + +tokenizer = BertTokenizer.from_pretrained("IDEA-CCNL/Erlangshen-Longformer-110M") +config = LongformerConfig.from_pretrained("IDEA-CCNL/Erlangshen-Longformer-110M") +model = LongformerModel.from_pretrained("IDEA-CCNL/Erlangshen-Longformer-110M") +``` + + + +## Citation +If you find the resource is useful, please cite the following website in your paper. 
+ +``` +@misc{Fengshenbang-LM, + title={Fengshenbang-LM}, + author={IDEA-CCNL}, + year={2021}, + howpublished={\url{https://github.com/IDEA-CCNL/Fengshenbang-LM}}, +} +``` diff --git a/fengshen/examples/mt5_summary/fastapi_mt5_summary.py b/fengshen/examples/mt5_summary/fastapi_mt5_summary.py new file mode 100644 index 0000000000000000000000000000000000000000..44adaf8f5855260c683c0bcfe7986ffccc9f25c4 --- /dev/null +++ b/fengshen/examples/mt5_summary/fastapi_mt5_summary.py @@ -0,0 +1,93 @@ +import os +import sys +import uvicorn +import torch +from fastapi import Body, FastAPI +from transformers import T5Tokenizer, MT5ForConditionalGeneration +import pytorch_lightning as pl +sys.path.append(os.path.abspath(os.path.join( + os.path.dirname(__file__), os.path.pardir))) +os.environ["CUDA_VISIBLE_DEVICES"] = '5' +os.environ["MASTER_ADDR"] = '127.0.0.1' +os.environ["MASTER_PORT"] = '6000' +device = "cuda:0" if torch.cuda.is_available() else "cpu" +print('device') +pretrain_model_path = '/cognitive_comp/ganruyi/hf_models/google/mt5-large' +# pretrain_model_path = 'google/mt5-small' +model_path = '/cognitive_comp/ganruyi/fengshen/mt5_large_summary/ckpt/epoch-0-last.ckpt' +tokenizer = T5Tokenizer.from_pretrained(pretrain_model_path) +print('load tokenizer') + + +class MT5FinetuneSummary(pl.LightningModule): + + def __init__(self): + super().__init__() + self.model = MT5ForConditionalGeneration.from_pretrained(pretrain_model_path) + + +model = MT5FinetuneSummary.load_from_checkpoint(model_path) +print('load checkpoint') +model.to(device) +model.eval() +app = FastAPI() +print('server start') + +# def flask_gen(text: str, level: float = 0.9, n_sample: int = 5, length: int = 32, is_beam_search=False): + + +@app.post('/mt5_summary') +async def flask_gen(text: str = Body('', title='原文', embed=True), + n_sample: int = 5, length: int = 32, is_beam_search=False): + if len(text) > 128: + text = text[:128] + text = 'summary:'+text + print(text) + # inputs = tokenizer(text, return_tensors='pt') + inputs = tokenizer.encode_plus( + text, max_length=128, padding='max_length', truncation=True, return_tensors='pt') + # print(inputs) + if is_beam_search: + generated_ids = model.model.generate( + input_ids=inputs['input_ids'].to(device), + attention_mask=inputs['attention_mask'].to(device), + max_length=length, + num_beams=n_sample, + repetition_penalty=2.5, + length_penalty=1.0, + early_stopping=True, + num_return_sequences=n_sample + ) + else: + generated_ids = model.model.generate( + input_ids=inputs['input_ids'].to(device), + attention_mask=inputs['attention_mask'].to(device), + max_length=length, + do_sample=True, + temperature=1.0, + top_p=1.0, + repetition_penalty=2.5, + # early_stopping=True, + num_return_sequences=n_sample + ) + result = [] + # print(tokenizer.all_special_tokens) + for sample in generated_ids: + preds = [tokenizer.decode(sample, skip_special_tokens=True, + clean_up_tokenization_spaces=True)] + preds = ''.join(preds) + # print(preds) + result.append(preds) + return result + + +if __name__ == '__main__': + uvicorn.run(app, host="0.0.0.0", port=6607, log_level="debug") +# # article = "日前,方舟子发文直指林志颖旗下爱碧丽推销假保健品,引起哗然。调查发现, +# 爱碧丽没有自己的生产加工厂。其胶原蛋白饮品无核心研发,全部代工生产。号称有“逆生长”功效的爱碧丽“梦幻奇迹限量组”售价>高达1080元,实际成本仅为每瓶4元!" +# article = '''在北京冬奥会自由式滑雪女子坡面障碍技巧决赛中,中国选手谷爱凌夺得银牌。祝贺谷爱凌! +# 今天上午,自由式滑雪女子坡面障碍技巧决赛举行。决赛分三轮进行,取选手最佳成绩排名决出奖牌。 +# 第一跳,中国选手谷爱凌获得69.90分。在12位选手中排名第三。完成动作后,谷爱凌又扮了个鬼脸,甚是可爱。 +# 第二轮中,谷爱凌在道具区第三个障碍处失误,落地时摔倒。获得16.98分。网友:摔倒了也没关系,继续加油! 
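# Hedged client example (port and body field follow the uvicorn/route definitions above):
#   curl -X POST 'http://127.0.0.1:6607/mt5_summary?n_sample=3&length=32' \
#        -H 'Content-Type: application/json' -d '{"text": "the article to summarize"}'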
+# 在第二跳失误摔倒的情况下,谷爱凌顶住压力,第三跳稳稳发挥,流畅落地!获得86.23分!此轮比赛,共12位选手参赛,谷爱凌第10位出场。网友:看比赛时我比谷爱凌紧张,加油!''' + # flask_gen(article, length=30) diff --git a/fengshen/examples/mt5_summary/mt5_summary.py b/fengshen/examples/mt5_summary/mt5_summary.py new file mode 100644 index 0000000000000000000000000000000000000000..de564026ae7a32873cc39515f421adfb9d7e4568 --- /dev/null +++ b/fengshen/examples/mt5_summary/mt5_summary.py @@ -0,0 +1,233 @@ +from fengshen.data.task_dataloader.task_datasets import LCSTSDataModel +from transformers import T5Tokenizer, MT5ForConditionalGeneration +from transformers.optimization import get_linear_schedule_with_warmup +from pytorch_lightning import Trainer, loggers +from pytorch_lightning.callbacks import ModelCheckpoint +from transformers import AutoTokenizer +import pytorch_lightning as pl +import json +import argparse +import torch +import os +import sys +sys.path.append('./') + +# os.environ["CUDA_VISIBLE_DEVICES"] = '4,5,6,7' + + +def test(): + tokenizer = T5Tokenizer.from_pretrained("google/mt5-small") + article = "UN Offizier sagt, dass weiter verhandelt werden muss in Syrien." + summary = "Weiter Verhandlung in Syrien." + article = "日前,方舟子发文直指林志颖旗下爱碧丽推销假保健品,引起哗然。调查发现,爱碧丽没有自己的生产加工厂。 \ + 其胶原蛋白饮品无核心研发,全部代工生产。号称有“逆生长”功效的爱碧丽“梦幻奇迹限量组”售价>高达1080元,实际成本仅为每瓶4元!" + summary = "林志颖公司疑涉虚假营销无厂房无研发" + inputs = tokenizer(article, rturn_tensors="pt") + tt = tokenizer.encode_plus(summary, max_length=64, + padding='max_length', truncation='longest_first') + print('tt:', tt) + print('inputs:', inputs) + with tokenizer.as_target_tokenizer(): + labels = tokenizer(summary, return_tensors="pt") + print('labels:', labels) + print('origin labels:', tokenizer.decode(labels['input_ids'][0])) + + model = MT5ForConditionalGeneration.from_pretrained("google/mt5-small") + # outputs = model(input_ids=inputs["input_ids"], labels=labels["input_ids"]) + # print(outputs.keys()) + + # evaluation + model.eval() + generated_ids = model.generate( + input_ids=inputs['input_ids'], + attention_mask=inputs['attention_mask'], + max_length=150, + num_beams=2, + repetition_penalty=2.5, + length_penalty=1.0, + early_stopping=True + ) + preds = [tokenizer.decode(g, skip_special_tokens=True, + clean_up_tokenization_spaces=True) for g in generated_ids] + print(preds) + + +class MT5FinetuneSummaryModelCheckpoint: + @staticmethod + def add_argparse_args(parent_args): + parser = parent_args.add_argument_group('BaseModel') + + parser.add_argument('--monitor', default='train_loss', type=str) + parser.add_argument('--mode', default='min', type=str) + parser.add_argument('--dirpath', default='./ckpt/', type=str) + parser.add_argument( + '--filename', default='model-{epoch:02d}-{train_loss:.4f}', type=str) + parser.add_argument('--save_last', action='store_true', default=True) + parser.add_argument('--save_top_k', default=3, type=float) + parser.add_argument('--every_n_train_steps', default=100, type=float) + parser.add_argument('--save_weights_only', default=True, type=bool) + + return parent_args + + def __init__(self, args): + self.callbacks = ModelCheckpoint(monitor=args.monitor, + save_top_k=args.save_top_k, + mode=args.mode, + every_n_train_steps=args.every_n_train_steps, + save_weights_only=args.save_weights_only, + dirpath=args.dirpath, + filename=args.filename, + save_last=args.save_last) + + +class MT5FinetuneSummary(pl.LightningModule): + + @staticmethod + def add_model_specific_args(parent_args): + parser = parent_args.add_argument_group('BaseModel') + parser.add_argument('--learning_rate', default=1e-4, type=float) + 
parser.add_argument('--weight_decay', default=0.1, type=float) + parser.add_argument('--warmup', default=0.01, type=float) + return parent_args + + def __init__(self, args, num_data): + super().__init__() + self.args = args + self.num_data = num_data + print('num_data:', num_data) + self.model = MT5ForConditionalGeneration.from_pretrained(args.pretrained_model_path) + + def setup(self, stage) -> None: + if stage == 'fit': + num_gpus = self.trainer.gpus if self.trainer.gpus is not None else 0 + self.total_step = int(self.trainer.max_epochs * self.num_data / + (max(1, num_gpus) * self.trainer.accumulate_grad_batches)) + print('Total training step:', self.total_step) + + def training_step(self, batch, batch_idx): + output = self.model(input_ids=batch['input_ids'], + attention_mask=batch['attention_mask'], labels=batch['labels']) + # output = self.model(input_ids=batch['input_ids'], labels=batch['labels']) + # acc = self.comput_metrix(output.logits, batch['labels']) + self.log('train_loss', output.loss) + return output.loss + + def comput_metrix(self, logits, labels): + y_pred = torch.argmax(logits, dim=-1) + y_pred = y_pred.view(size=(-1,)) + y_true = labels.view(size=(-1,)).float() + corr = torch.eq(y_pred, y_true) + acc = torch.sum(corr.float())/labels.size()[0] + return acc + + def validation_step(self, batch, batch_idx): + output = self.model(input_ids=batch['input_ids'], + attention_mask=batch['attention_mask'], labels=batch['labels']) + # output = self.model(input_ids=batch['input_ids'], labels=batch['labels']) + # acc = self.comput_metrix(output.logits, batch['labels']) + self.log('val_loss', output.loss) + # self.log('val_acc', acc) + + def predict_step(self, batch, batch_idx): + text = batch['text'] + summary = batch['summary'] + generated_ids = self.model.generate( + input_ids=batch['input_ids'], + attention_mask=batch['attention_mask'], + max_length=self.args.max_dec_length + ) + return {"pred": generated_ids, "text": text, "summary": summary} + + def configure_optimizers(self): + no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] + paras = list( + filter(lambda p: p[1].requires_grad, self.named_parameters())) + paras = [{ + 'params': + [p for n, p in paras if not any(nd in n for nd in no_decay)], + 'weight_decay': self.args.weight_decay + }, { + 'params': [p for n, p in paras if any(nd in n for nd in no_decay)], + 'weight_decay': 0.0 + }] + optimizer = torch.optim.AdamW(paras, lr=self.args.learning_rate) + scheduler = get_linear_schedule_with_warmup( + optimizer, int(self.total_step * self.args.warmup), + self.total_step) + + return [{ + 'optimizer': optimizer, + 'lr_scheduler': { + 'scheduler': scheduler, + 'interval': 'step', + 'frequency': 1 + } + }] + + +def save_test(data, args, data_model): + tokenizer = AutoTokenizer.from_pretrained(args.pretrained_model_path) + with open(os.path.join(args.output_save_path), 'w', encoding='utf-8') as f: + for _, batch in enumerate(data): + texts = batch['text'] + summarys = batch['summary'] + preds = batch['pred'] + for idx, pred_ids in enumerate(preds): + text = texts[idx] + summary = summarys[idx] + tmp_result = dict() + preds = [tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=True) + for g in pred_ids] + tmp_result['summary'] = ''.join(preds) + tmp_result['label'] = summary + tmp_result['origin_text'] = text + json_data = json.dumps(tmp_result, ensure_ascii=False) + f.write(json_data+'\n') + print('save the result to '+args.output_save_path) + + +def main(): + total_parser = 
argparse.ArgumentParser("Summary Task") + total_parser.add_argument('--do_eval_only', action='store_true', default=False) + total_parser.add_argument('--pretrained_model_path', default='google/mt5-small', type=str) + total_parser.add_argument('--output_save_path', default='./predict.json', type=str) + # * Args for data preprocessing + total_parser = LCSTSDataModel.add_data_specific_args(total_parser) + # * Args for training + total_parser = Trainer.add_argparse_args(total_parser) + total_parser = MT5FinetuneSummaryModelCheckpoint.add_argparse_args(total_parser) + total_parser = MT5FinetuneSummary.add_model_specific_args(total_parser) + # * Args for base model + args = total_parser.parse_args() + + data_model = LCSTSDataModel(args) + if not args.do_eval_only: + model = MT5FinetuneSummary(args, len(data_model.train_dataloader())) + checkpoint_callback = MT5FinetuneSummaryModelCheckpoint(args).callbacks + logger = loggers.TensorBoardLogger(save_dir=os.path.join( + args.default_root_dir, 'log/'), name='mt5_summary') + trainer = Trainer.from_argparse_args(args, + logger=logger, + callbacks=[checkpoint_callback] + ) + trainer.fit(model, data_model) + else: + trainer = Trainer.from_argparse_args(args) + model = MT5FinetuneSummary.load_from_checkpoint( + args.resume_from_checkpoint, args=args, num_data=len(data_model.predict_dataloader())) + result = trainer.predict(model, data_model) + if torch.distributed.get_rank() == 0: + save_test(result, args, data_model) + + +if __name__ == '__main__': + main() + # test() + +''' +python examples/mt5_summary.py --gpus=1 --test_data=test_public.jsonl +--default_root_dir=/cognitive_comp/ganruyi/fengshen/mt5_summary/eval +--do_eval_only +--resume_from_checkpoint=/cognitive_comp/ganruyi/fengshen/mt5_summary/ckpt/model-epoch=01-train_loss=1.9166.ckpt +--strategy=ddp +''' diff --git a/fengshen/examples/mt5_summary/pretrain_mt5_summary.sh b/fengshen/examples/mt5_summary/pretrain_mt5_summary.sh new file mode 100644 index 0000000000000000000000000000000000000000..a77b88006211d6f7a432672f4ac29a58d9865d66 --- /dev/null +++ b/fengshen/examples/mt5_summary/pretrain_mt5_summary.sh @@ -0,0 +1,118 @@ +#!/bin/bash +#SBATCH --job-name=mt5_large_summary +#SBATCH --nodes=1 +#SBATCH --ntasks-per-node=4 +#SBATCH --gres=gpu:4 # number of gpus +#SBATCH -o /cognitive_comp/ganruyi/fengshen/mt5_large_summary/%x-%j.log +#SBATCH -e /cognitive_comp/ganruyi/fengshen/mt5_large_summary/%x-%j.err + +set -x -e + +echo "START TIME: $(date)" +MICRO_BATCH_SIZE=16 +ROOT_DIR=/cognitive_comp/ganruyi/fengshen/mt5_large_summary + +ZERO_STAGE=2 + +config_json="$ROOT_DIR/ds_config.$SLURM_JOBID.json" + +# Deepspeed figures out GAS dynamically from dynamic GBS via set_train_batch_size() +cat < $config_json +{ + "train_micro_batch_size_per_gpu": 16, + "steps_per_print": 100, + "gradient_clipping": 1.0, + "zero_optimization": { + "stage": $ZERO_STAGE, + "contiguous_gradients": false, + "overlap_comm": true, + "reduce_scatter": true, + "reduce_bucket_size": 50000000, + "allgather_bucket_size": 500000000 + }, + "optimizer": { + "type": "Adam", + "params": { + "lr": 1e-5, + "betas": [ + 0.9, + 0.95 + ], + "eps": 1e-8, + "weight_decay": 1e-2 + } + }, + "scheduler": { + "type": "WarmupLR", + "params":{ + "warmup_min_lr": 5e-6, + "warmup_max_lr": 1e-5 + } + }, + "zero_allow_untested_optimizer": false, + "fp16": { + "enabled": true, + "loss_scale": 0, + "loss_scale_window": 1000, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "activation_checkpointing": { + "partition_activations": false, + 
"contiguous_memory_optimization": false + }, + "wall_clock_breakdown": false +} +EOT + +# export PL_DEEPSPEED_CONFIG_PATH=$config_json + +TRAINER_ARGS=" + --max_epochs 2 \ + --gpus 4 \ + --num_nodes 1 \ + --strategy ddp \ + --default_root_dir $ROOT_DIR \ + --dirpath $ROOT_DIR/ckpt \ + --save_top_k 3 \ + --monitor train_loss \ + --mode min \ + --save_last \ +" +DATA_DIR=/cognitive_comp/ganruyi/data_datasets_LCSTS_LCSTS/ +prompt="summary:" +DATA_ARGS=" + --data_dir $DATA_DIR + --train_batchsize $MICRO_BATCH_SIZE \ + --valid_batchsize $MICRO_BATCH_SIZE \ + --train_data train.jsonl\ + --valid_data valid.jsonl\ + --test_data valid.jsonl\ + --prompt $prompt \ +" + +MODEL_ARGS=" + --pretrained_model_path /cognitive_comp/ganruyi/hf_models/google/mt5-large \ + --output_save_path $ROOT_DIR/mt5_large_predict_lcsts.json \ + --learning_rate 1e-4 \ + --weight_decay 0.1 \ + --warmup 0.01 \ +" + +SCRIPTS_PATH=/cognitive_comp/ganruyi/fengshen/examples/mt5_summary.py + +export CMD=" \ + $SCRIPTS_PATH \ + $TRAINER_ARGS \ + $MODEL_ARGS \ + $DATA_ARGS \ + " + +echo $CMD + +SINGULARITY_PATH=/cognitive_comp/ganruyi/pytorch21_06_py3_docker_image_v2.sif +#singularity exec --nv -B /cognitive_comp/ganruyi/Megatron/:/cognitive_comp/ganruyi/Megatron/,/cognitive_comp/gaoxinyu/:/cognitive_comp/gaoxinyu/ $SINGULARITY_PATH python $CMD + +# to debug - add echo (it exits and prints what it would have launched) +#run_cmd="$PY_LAUNCHER $CMD" +clear; srun singularity exec --nv -B /cognitive_comp/ganruyi/:/cognitive_comp/ganruyi/ $SINGULARITY_PATH bash -c 'python $CMD' \ No newline at end of file diff --git a/fengshen/examples/pegasus/README.md b/fengshen/examples/pegasus/README.md new file mode 100644 index 0000000000000000000000000000000000000000..f04b83c348ab2fe34a06a428523bc48169c7b478 --- /dev/null +++ b/fengshen/examples/pegasus/README.md @@ -0,0 +1,78 @@ +# 燃灯系列-Pegasus摘要模型预训练 +Pegasus预训练模型是专门为摘要任务而设计的预训练模型,相比于其它通用预训练模型,Pegasus 模型的架构设计更贴近下游的摘要任务,在摘要抽取的效果上的表现相比其他通用模型表现更好 + +### 模型架构和参数 +Pegasus的模型架构是标准的encoder-decoder的Transformer结构,训练任务是用的是GSG( Gap Sentences Generation)任务。GSG任务主要是通过对文本中的重要的句子进行mask,然后再通过decoder恢复。模型详细参数可看config.json + +1. base版本 + +| 配置 | 参数 | +| ---- | ---- | +| encoder layers | 12 | +| encoder_attention_heads | 12 | +| encoder_ffn_dim | 3072 | +| decoder layers | 12 | +| decoder_attention_heads| 12 | +| decoder_ffn_dim | 3072 | +| max_encode_length | 512 | + +2. large 版本 + +| 配置 | 参数 | +| ---- | ---- | +| encoder layers | 16 | +| encoder_attention_heads | 16 | +| encoder_ffn_dim | 4096 | +| decoder layers | 16 | +| decoder_attention_heads| 16 | +| decoder_ffn_dim | 4096 | +| max_encode_length | 1024 | + +### 训练数据 +训练数据使用的是wudao 180g数据。数据进行了简单的预处理包括: +1. 过滤过长单句(这样的句子通常会包括一些乱码句,无上下文语义的列表句、各种符号句,歌词句等) +2. 过滤句子数过少文本,如句子数少于3句则抛弃 + +### 模型 + +pegasus-base: [Randeng_pegasus_238M_summary](https://huggingface.co./IDEA-CCNL/Randeng_Pegasus_238M_Summary)
+pegasus-large: [Randeng_pegasus_523M_summary](https://huggingface.co./IDEA-CCNL/Randeng_Pegasus_523M_Summary) + +主要文件: +- tokenizers_pegasus.py 中文版pegasus的tokenize实现 +- pretrain_pegasus.py 模型预训练的核心实现文件 +- pretrain_pegasusu.sh 预训练脚本,具体参数可通过此脚本修改 +- data_utils.py 模型的一些工具代码 + +#### 使用方式 +可直接通过Hugging face或者pytoch-ligthning框架调用。下面给出的例子是hugging face的调用方法: +```python +from transformers import PegasusForConditionalGeneration +# Need to download tokenizers_pegasus.py and other Python script from Fengshenbang-LM github repo in advance, +# or you can mv download in tokenizers_pegasus.py and data_utils.py in https://huggingface.co./IDEA-CCNL/Randeng_Pegasus_238M_Summary/tree/main +# Stronly recomend you git clone the Fengshenbang-LM repo: +# 1. git clone https://github.com/IDEA-CCNL/Fengshenbang-LM +# 2. cd Fengshenbang-LM/fengshen/examples/pegasus/ +# and then you will see the tokenizers_pegasus.py and data_utils.py which are needed by pegasus model +from tokenizers_pegasus import PegasusTokenizer + +model = PegasusForConditionalGeneration.from_pretrained("IDEA-CCNL/randeng_pegasus_238M_summary") +tokenizer = PegasusTokenizer.from_pretrained("path/to/vocab.txt") + +text = "在北京冬奥会自由式滑雪女子坡面障碍技巧决赛中,中国选手谷爱凌夺得银牌。祝贺谷爱凌!今天上午,自由式滑雪女子坡面障碍技巧决赛举行。决赛分三轮进行,取选手最佳成绩排名决出奖牌。第一跳,中国选手谷爱凌获得69.90分。在12位选手中排名第三。完成动作后,谷爱凌又扮了个鬼脸,甚是可爱。第二轮中,谷爱凌在道具区第三个障碍处失误,落地时摔倒。获得16.98分。网友:摔倒了也没关系,继续加油!在第二跳失误摔倒的情况下,谷爱凌顶住压力,第三跳稳稳发挥,流畅落地!获得86.23分!此轮比赛,共12位选手参赛,谷爱凌第10位出场。网友:看比赛时我比谷爱凌紧张,加油!" +inputs = tokenizer(text, max_length=1024, return_tensors="pt") + +# Generate Summary +summary_ids = model.generate(inputs["input_ids"]) +tokenizer.batch_decode(summary_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] +``` + +### 下游效果 + +#### LCSTS摘要数据finetune后效果 + +| model | rouge-1 | rouge-2 | rouge-L | +| ---- | ---- | ---- | ---- | +| Pegasus-base | 44.13 | 31.31 | 41.06 | +| Pegasus-large | 49.42 | 37.91 | 46.63 | + diff --git a/fengshen/examples/pegasus/data_utils.py b/fengshen/examples/pegasus/data_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..879798749bc06d6857c01ec101baf5f3fb61d012 --- /dev/null +++ b/fengshen/examples/pegasus/data_utils.py @@ -0,0 +1,319 @@ +# -*- coding: utf-8 -*- + +import re +import six +import unicodedata +import torch +import rouge +import numpy as np +import random +# from fengshen.examples.pegasus.pegasus_utils import text_segmentate +import sys + +sys.path.append('../../../') + +rouge = rouge.Rouge() + + +is_py2 = six.PY2 + +if not is_py2: + basestring = str + + +def _is_chinese_char(cp): + """Checks whether CP is the codepoint of a CJK character.""" + # This defines a "chinese character" as anything in the CJK Unicode block: + # https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block) + # + # Note that the CJK Unicode block is NOT all Japanese and Korean characters, + # despite its name. The modern Korean Hangul alphabet is a different block, + # as is Japanese Hiragana and Katakana. Those alphabets are used to write + # space-separated words, so they are not treated specially and handled + # like the all of the other languages. 
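    # Ranges tested below: CJK Unified Ideographs (U+4E00..U+9FFF), Extension A
    # (U+3400..U+4DBF), Extensions B through E (separate blocks within U+20000..U+2CEAF),
    # and the CJK Compatibility Ideographs blocks (U+F900..U+FAFF, U+2F800..U+2FA1F).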
+ if ((cp >= 0x4E00 and cp <= 0x9FFF) or (cp >= 0x3400 and cp <= 0x4DBF) + or (cp >= 0x20000 and cp <= 0x2A6DF) + or (cp >= 0x2A700 and cp <= 0x2B73F) + or (cp >= 0x2B740 and cp <= 0x2B81F) + or (cp >= 0x2B820 and cp <= 0x2CEAF) + or (cp >= 0xF900 and cp <= 0xFAFF) + or (cp >= 0x2F800 and cp <= 0x2FA1F)): + return True + + return False + + +def _is_whitespace(char): + """Checks whether `char` is a whitespace character.""" + # \t, \n, and \r are technically control characters but we treat them + # as whitespace since they are generally considered as such. + if char == " " or char == "\t" or char == "\n" or char == "\r": + return True + cat = unicodedata.category(char) + if cat == "Zs": + return True + return False + + +def _is_control(char): + """Checks whether `char` is a control character.""" + # These are technically control characters but we count them as whitespace + # characters. + if char == "\t" or char == "\n" or char == "\r": + return False + cat = unicodedata.category(char) + if cat.startswith("C"): + return True + return False + + +def _is_punctuation(char): + """Checks whether `char` is a punctuation character.""" + cp = ord(char) + # We treat all non-letter/number ASCII as punctuation. + # Characters such as "^", "$", and "`" are not in the Unicode + # Punctuation class but we treat them as punctuation anyways, for + # consistency. + if (cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or ( + cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126): + return True + cat = unicodedata.category(char) + if cat.startswith("P"): + return True + return False + + +def is_string(s): + """判断是否是字符串 + """ + return isinstance(s, basestring) + + +def is_stopwords(word, stopwords): + if word in stopwords: + return True + else: + return False + + +def text_segmentate(text): + en_seg_pattern = '((?:\\!|\\?|\\.|\\n)+(?:\\s)+)' + ch_seg_pattern = '((?:?|!|。|\\n)+)' + try: + text = re.sub(en_seg_pattern, r'\1[SEP]', text) + # print("sub text: ", text) + except Exception as e: + print("input: ", text) + raise e + text = re.sub(ch_seg_pattern, r'\1[SEP]', text) + # print("sub ch text: ", text) + text_list = text.split("[SEP]") + text_list = list(filter(lambda x: len(x) != 0, text_list)) + return text_list + + +def load_stopwords(stopwords_path): + stopwords_dict = {} + with open(stopwords_path, "r") as rf: + for line in rf: + line = line.strip() + if line not in stopwords_dict: + stopwords_dict[line] = 0 + else: + pass + return stopwords_dict + + +def text_process(text, max_length): + """分割文本 + """ + texts = text_segmentate(text) + + result, length = [], 0 + for text in texts: + if length + len(text) > max_length * 1.3 and len(result) >= 3: + yield result + result, length = [], 0 + result.append(text) + length += len(text) + if result and len(result) >= 3: + yield result + + +def text_process_split_long_content(text, max_length): + """分割长文本 + """ + texts = text_segmentate(text) + + result, sentence_num = "", 0 + for text in texts: + if len(text) > 500: + if len(result) > 300 and sentence_num >= 3: + yield result + result, sentence_num = "", 0 + else: + result, sentence_num = "", 0 + continue + else: + if len(result) + len(text) > max_length * 1.1 and sentence_num >= 3: + yield result + result, sentence_num = "", 0 + result += text + sentence_num += 1 + + if result and sentence_num >= 3: + yield result + + +def gather_join(texts, idxs): + """取出对应的text,然后拼接起来 + """ + return ''.join([texts[i] for i in idxs]) + + +def gather_join_f1(texts_token, idsx): + join_texts = [] + for id in idsx: + 
join_texts.extend(texts_token[id]) + return join_texts + + +def compute_rouge(source, target): + """计算rouge-1、rouge-2、rouge-l + """ + source, target = ' '.join(source), ' '.join(target) + try: + scores = rouge.get_scores(hyps=source, refs=target) + return { + 'rouge-1': scores[0]['rouge-1']['f'], + 'rouge-2': scores[0]['rouge-2']['f'], + 'rouge-l': scores[0]['rouge-l']['f'], + } + except ValueError: + return { + 'rouge-1': 0.0, + 'rouge-2': 0.0, + 'rouge-l': 0.0, + } + + +def remove_stopwords(texts, stopwords_dict): + for i, text in enumerate(texts): + texts[i] = list(filter(lambda x: x not in stopwords_dict, text)) + return texts + + +def pseudo_summary_f1(texts, + stopwords, + tokenizer, + max_length, + rouge_strategy="rouge-l"): + """构建伪标签摘要数据集 + """ + summary_rate = 0.25 + max_length = max_length - 1 + texts_tokens = [] + sentece_idxs_vec = [] + for text in texts: + if len(texts) == 0: + continue + try: + ids = tokenizer.encode(text.strip())[:-1] + except ValueError: + print("error, input : ", text) + raise ValueError + sentece_idxs_vec.append(ids) + tokens = [tokenizer._convert_id_to_token(token) for token in ids] + texts_tokens.append(tokens) + + texts_tokens_rm = remove_stopwords(texts_tokens, stopwords) + source_idxs, target_idxs = list(range(len(texts))), [] + + assert len(texts_tokens) == len(texts) + # truncate_index = 0 + while True: + sims = [] + for i in source_idxs: + new_source_idxs = [j for j in source_idxs if j != i] + new_target_idxs = sorted(target_idxs + [i]) + new_source = gather_join_f1(texts_tokens_rm, new_source_idxs) + new_target = gather_join_f1(texts_tokens_rm, new_target_idxs) + sim = compute_rouge(new_source, new_target)[rouge_strategy] + sims.append(sim) + new_idx = source_idxs[np.argmax(sims)] + del sims + source_idxs.remove(new_idx) + target_idxs = sorted(target_idxs + [new_idx]) + source = gather_join(texts, source_idxs) + target = gather_join(texts, target_idxs) + try: + if (len(source_idxs) == 1 + or 1.0 * len(target) / len(source) > summary_rate): + break + except ZeroDivisionError as e: + print(e.meesage) + print(texts) + print("source: ", source) + print("target: ", target) + + if len(source) < len(target): + source, target = target, source + source_idxs, target_idxs = target_idxs, source_idxs + + return sentece_idxs_vec, source, target, source_idxs, target_idxs + + +def get_input_mask(sentence_id_vec, indexs): + target_idxs = [] + input_idxs = [] + kMaskSentenceTokenId = 2 + kEosTokenId = 1 + mask_sentence_options_cumulative_prob = [0.9, 0.9, 1, 1] + for index in indexs: + target_idxs.extend(sentence_id_vec[index]) + choice = random.uniform(0, 1) + if choice < mask_sentence_options_cumulative_prob[0]: + # print("mask index: ", index) + sentence_id_vec[index] = [kMaskSentenceTokenId] + elif choice < mask_sentence_options_cumulative_prob[1]: + # print("replace index: ", index) + replace_id = random.randint(0, len(sentence_id_vec)) + sentence_id_vec[index] = sentence_id_vec[replace_id] + elif choice < mask_sentence_options_cumulative_prob[2]: + pass + else: + sentence_id_vec[index] = [] + + target_idxs.append(kEosTokenId) + # print(sentence_id_vec) + for index, sentence_id in enumerate(sentence_id_vec): + # print(index, sentence_id) + if len(sentence_id) == 0: + continue + input_idxs.extend(sentence_id_vec[index]) + + input_idxs.append(kEosTokenId) + return input_idxs, target_idxs + + +def shift_tokens_right(input_ids: torch.Tensor, pad_token_id: int, + decoder_start_token_id: int): + """ + Shift input ids one token to the right. 
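+    Used to build `decoder_input_ids` from `labels` for teacher forcing: position 0 is set
+    to `decoder_start_token_id` and any -100 label values are mapped back to `pad_token_id`.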
+ """ + shifted_input_ids = input_ids.new_zeros(input_ids.shape) + shifted_input_ids[:, 1:] = input_ids[:, :-1].clone() + shifted_input_ids[:, 0] = decoder_start_token_id + + if pad_token_id is None: + raise ValueError("self.model.config.pad_token_id has to be defined.") + # replace possible -100 values in labels by `pad_token_id` + shifted_input_ids.masked_fill_(shifted_input_ids == -100, pad_token_id) + + return shifted_input_ids + + +def padding_to_maxlength(ids, max_length, pad_id): + cur_len = len(ids) + len_diff = max_length - cur_len + return ids + [pad_id] * len_diff, [1] * cur_len + [0] * len_diff diff --git a/fengshen/examples/pegasus/pretrain_pegasus.py b/fengshen/examples/pegasus/pretrain_pegasus.py new file mode 100644 index 0000000000000000000000000000000000000000..0059355f5d5bf6d149e01fc3dc15d3a760932733 --- /dev/null +++ b/fengshen/examples/pegasus/pretrain_pegasus.py @@ -0,0 +1,181 @@ +# -*- coding: utf-8 -*- + + +from fengshen.models.model_utils import add_module_args +from transformers import PegasusForConditionalGeneration, PegasusConfig +from pytorch_lightning import Trainer, loggers, LightningModule +from pytorch_lightning.callbacks import LearningRateMonitor +from tokenizers_pegasus import PegasusTokenizer +from utils import UniversalCheckpoint +from data.universal_datamodule import UniversalDataModule +from data_utils import ( + get_input_mask, pseudo_summary_f1, shift_tokens_right, + padding_to_maxlength, load_stopwords, text_segmentate) +import argparse +import torch +import os +import sys + +sys.path.append('../../') + + +# os.environ["CUDA_VISIBLE_DEVICES"] = '6' + + +class FakeAbstractCollator: + + def __init__(self, tokenizer, stopwords_dict, max_enc_length): + self.tokenizer = tokenizer + self.max_seq_length = max_enc_length + self.stopwords_dict = stopwords_dict + + def __call__(self, samples): + # print("samples: ", samples) + labels = [] + attn_mask = [] + decoder_attn_mask = [] + source_inputs = [] + + for text in samples: + texts = text["chunks"] + text = text_segmentate(texts) + sentence_id_vec, source, target, source_idxs, target_idxs = pseudo_summary_f1( + text, self.stopwords_dict, self.tokenizer, self.max_seq_length, + "rouge-l") + source_idxs, target_idxs = get_input_mask(sentence_id_vec, + target_idxs) + if len(source_idxs) > self.max_seq_length: + if 2 not in source_idxs[self.max_seq_length - 1:]: + source_idxs = source_idxs[:self.max_seq_length] + source_idxs[-1] = self.tokenizer.eos_token_id + sys.stderr.write("Warning split long line: " + source + + "\n") + else: + continue + + source_idxs, attention_mask = padding_to_maxlength( + source_idxs, self.max_seq_length, self.tokenizer.pad_token_id) + label, target_attention_mask = padding_to_maxlength( + target_idxs, self.max_seq_length, self.tokenizer.pad_token_id) + # print("sample len: ", len(source_idxs)) + source_inputs.append(source_idxs) + attn_mask.append(attention_mask) + decoder_attn_mask.append(target_attention_mask) + labels.append(label) + labels = torch.tensor(labels) + decode_input_idxs = shift_tokens_right(labels, + self.tokenizer.pad_token_id, + self.tokenizer.pad_token_id) + end_token_index = torch.where(labels == self.tokenizer.eos_token_id)[1] + for idx, end_idx in enumerate(end_token_index): + labels[idx][end_idx + 1:] = -100 + + # print("call samples: ") + return { + "input_ids": torch.tensor(source_inputs), + "attention_mask": torch.tensor(attn_mask), + "labels": labels, + "decoder_input_ids": decode_input_idxs, + "decoder_attention_mask": torch.tensor(decoder_attn_mask) + } + 
+ +class PegasusChineseModel(LightningModule): + + def __init__(self, args, **kwargs): + super().__init__() + self.args = args + self.save_hyperparameters(args) + config = PegasusConfig.from_json_file( + os.path.join(args.model_path, "config.json")) + print("vocab_size: ", config.vocab_size) + self.model = PegasusForConditionalGeneration(config=config) + print("model.num_parameters: ", self.model.num_parameters()) + + def setup(self, stage) -> None: + if stage == 'fit': + train_loader = self.trainer._data_connector._train_dataloader_source.dataloader( + ) + + # Calculate total steps + tb_size = self.hparams.train_batchsize * max(1, self.trainer.gpus) + ab_size = self.trainer.accumulate_grad_batches * float( + self.trainer.max_epochs) + self.total_steps = (len(train_loader.dataset) // + tb_size) // ab_size + print('Total training step:', self.total_steps) + + def configure_optimizers(self): + from fengshen.models.model_utils import configure_optimizers + return configure_optimizers(self) + + def training_step(self, batch, batch_idx): + output = self.model(**batch) + self.log('train_loss', output.loss, sync_dist=True) + return output.loss + + def comput_metrix(self, logits, labels): + y_pred = torch.argmax(logits, dim=-1) + y_pred = y_pred.view(size=(-1, )) + y_true = labels.view(size=(-1, )).float() + corr = torch.eq(y_pred, y_true) + acc = torch.sum(corr.float()) / labels.size()[0] + return acc + + def validation_step(self, batch, batch_idx): + output = self.model(**batch) + acc = self.comput_metrix(output.logits, batch['labels']) + self.log('val_loss', output.loss, sync_dist=True) + self.log('val_acc', acc, sync_dist=True) + + def on_save_checkpoint(self, checkpoint) -> None: + if self.trainer._accelerator_connector.cluster_environment.global_rank( + ) == 0: + self.model.save_pretrained( + os.path.join( + self.trainer.checkpoint_callback.dirpath, + 'hf_pretrained_epoch{}_step{}'.format( + checkpoint['epoch'], checkpoint['global_step']))) + + +def main(): + args_parser = argparse.ArgumentParser("Pegasus Task") + + args_parser = UniversalDataModule.add_data_specific_args(args_parser) + args_parser = Trainer.add_argparse_args(args_parser) + args_parser = UniversalCheckpoint.add_argparse_args(args_parser) + args_parser = add_module_args(args_parser) + args_parser.add_argument('--deepspeed') + args_parser.add_argument( + '--stopword_path', + default="/cognitive_comp/dongxiaoqun/project/pegasus/own/pegasus/stopwords", + type=str) + args_parser.add_argument('--max_seq_length', default=1024, type=int) + args = args_parser.parse_args() + + tokenizer = PegasusTokenizer.from_pretrained(args.model_path) + stopwords_dict = load_stopwords(args.stopword_path) + collator = FakeAbstractCollator(tokenizer, stopwords_dict, + args.max_seq_length) + data_module = UniversalDataModule(tokenizer=tokenizer, + args=args, + collate_fn=collator) + module = PegasusChineseModel(args) + lr_monitor = LearningRateMonitor(logging_interval='step') + logger = loggers.TensorBoardLogger( + save_dir=os.path.join(args.default_root_dir, 'logs/'), + name=os.path.basename(os.path.dirname(args.model_path))) + checkpoint_callback = UniversalCheckpoint(args).callbacks + + # autotuning + if args.deepspeed is not None: + os.environ['PL_DEEPSPEED_CONFIG_PATH'] = args.deepspeed + + trainer = Trainer.from_argparse_args( + args, logger=logger, callbacks=[lr_monitor, checkpoint_callback]) + + trainer.fit(module, data_module) + + +if __name__ == '__main__': + main() diff --git a/fengshen/examples/pegasus/pretrain_pegasus.sh 
b/fengshen/examples/pegasus/pretrain_pegasus.sh new file mode 100644 index 0000000000000000000000000000000000000000..3a371ac45463317fa01fa84a72f5df6bb9ca6bd5 --- /dev/null +++ b/fengshen/examples/pegasus/pretrain_pegasus.sh @@ -0,0 +1,119 @@ +#!/bin/bash +#SBATCH --job-name=pegasus-base_last # create a short name for your job +#SBATCH --nodes=1 # node count +#SBATCH --ntasks-per-node=8 # number of tasks to run per node +#SBATCH --cpus-per-task=30 # cpu-cores per task (>1 if multi-threaded tasks) +#SBATCH --gres=gpu:8 # number of gpus per node +#SBATCH -o %x-%j.log # output and error log file names (%x for job id) + + +set -x -e + +echo "START TIME: $(date)" +MODEL_NAME=pegasus-base_test + +config_json="./$MODEL_NAME.ds_config.json" +export MASTER_PORT=$[RANDOM%10000+40000] + +MICRO_BATCH_SIZE=4 + +# Deepspeed figures out GAS dynamically from dynamic GBS via set_train_batch_size() +cat < $config_json +{ + "zero_optimization": { + "stage": 1 + }, + "fp16": { + "enabled": true, + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 16, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "optimizer": { + "params": { + "betas": [ + 0.9, + 0.999 + ], + "eps": 1e-08, + "lr": 1e-04, + "weight_decay": 0.01 + }, + "type": "Adam" + }, + "scheduler": { + "params": { + "warmup_max_lr": 1e-04, + "warmup_min_lr": 1e-05, + "total_num_steps": 80000000, + "warmup_num_steps" : 50000 + }, + "type": "WarmupDecayLR" + }, + "steps_per_print": 100, + "gradient_clipping": 1, + "train_micro_batch_size_per_gpu": $MICRO_BATCH_SIZE, + "zero_allow_untested_optimizer": false +} +EOT + +export PL_DEEPSPEED_CONFIG_PATH=$config_json +export TORCH_EXTENSIONS_DIR=/cognitive_comp/dongxiaoqun/torch_extendsions + +DATA_ARGS="\ + --datasets_name wudao_180g_512 \ + --num_workers 20 \ + --train_batchsize $MICRO_BATCH_SIZE \ + --val_batchsize 8 \ + --test_batchsize 8 \ + --max_seq_length 512 \ + --val_datasets_field valid \ + " + +MODEL_ARGS="\ + --model_path /cognitive_comp/dongxiaoqun/pretrained_model/pegasus-base/ \ + --learning_rate 1e-5 \ + --weight_decay 0.1 \ + --warmup_ratio 0.001 \ + " + +MODEL_CHECKPOINT_ARGS="\ + --monitor train_loss \ + --save_top_k 3 \ + --mode min \ + --every_n_train_steps 200 \ + --dirpath /cognitive_comp/dongxiaoqun/train_model/fengshen-$MODEL_NAME_debug/ckpt \ + --filename model-{step:02d}-{train_loss:.4f} \ + --save_last \ + " + +TRAINER_ARGS="\ + --gradient_clip_val 1.0 \ + --max_epochs 1 \ + --gpus 2 \ + --num_nodes 1 \ + --strategy ddp \ + --log_every_n_steps 100 \ + --val_check_interval 0.1 \ + --accumulate_grad_batches 8 \ + --default_root_dir /cognitive_comp/dongxiaoqun/train_model/fengshen-$MODEL_NAME_debug \ + --stopword_path /cognitive_comp/dongxiaoqun/pretrained_model/pegasus-large/stopwords \ + " + + +export options=" \ + $DATA_ARGS \ + $MODEL_ARGS \ + $MODEL_CHECKPOINT_ARGS \ + $TRAINER_ARGS \ + " + +SINGULARITY_PATH=/cognitive_comp/dongxiaoqun/software/docker/pytorch21_06_py3_docker_image_v2.sif +export SCRIPT_PATH=/cognitive_comp/dongxiaoqun/project/idea-ccnl/bug_fix/Fengshenbang-LM/fengshen/examples/pegasus/pretrain_pegasus.py + +# python $SCRIPT_PATH $options +source activate +conda activate torchnew +srun --nodes=1 --ntasks-per-node=1 --gres=gpu:2 --cpus-per-task=30 -o ${MODEL_NAME}-%J.log --jobid=226191 bash -c 'python3 $SCRIPT_PATH $options' diff --git a/fengshen/examples/pegasus/tokenizers_pegasus.py b/fengshen/examples/pegasus/tokenizers_pegasus.py new file mode 100644 index 0000000000000000000000000000000000000000..f532875987b59a42aca9ad35eb7a1945c992869b --- 
/dev/null +++ b/fengshen/examples/pegasus/tokenizers_pegasus.py @@ -0,0 +1,597 @@ +from fengshen.examples.pegasus.data_utils import ( + _is_control, + _is_punctuation, + _is_whitespace, + _is_chinese_char) +from transformers import PreTrainedTokenizer +from transformers import logging +from typing import List, Optional, Tuple, Union +import collections +import os +import unicodedata +import re +import jieba +import sys + +sys.path.append("../../../../") + +jieba.dt.tmp_dir = os.path.expanduser("~/.cache/") +# jieba.enable_parallel(8) +jieba.initialize() + +logger = logging.get_logger(__name__) + +VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt"} + + +def load_vocab(vocab_file): + """Loads a vocabulary file into a dictionary.""" + vocab = collections.OrderedDict() + with open(vocab_file, "r", encoding="utf-8") as reader: + tokens = reader.readlines() + for index, token in enumerate(tokens): + token = token.rstrip("\n") + vocab[token] = index + return vocab + + +def whitespace_tokenize(text): + """Runs basic whitespace cleaning and splitting on a piece of text.""" + text = text.strip() + if not text: + return [] + tokens = text.split() + return tokens + + +class PegasusTokenizer(PreTrainedTokenizer): + # copy from BertTokenizer + r""" + Construct a Pegasus tokenizer. Based on WordPiece. + This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to + this superclass for more information regarding those methods. + Args: + vocab_file (`str`): + File containing the vocabulary. + do_lower_case (`bool`, *optional*, defaults to `True`): + Whether or not to lowercase the input when tokenizing. + do_basic_tokenize (`bool`, *optional*, defaults to `True`): + Whether or not to do basic tokenization before WordPiece. + never_split (`Iterable`, *optional*): + Collection of tokens which will never be split during tokenization. Only has an effect when + `do_basic_tokenize=True` + unk_token (`str`, *optional*, defaults to `"[UNK]"`): + The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this + token instead. + sep_token (`str`, *optional*, defaults to `"[SEP]"`): + The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for + sequence classification or for a text and a question for question answering. It is also used as the last + token of a sequence built with special tokens. + pad_token (`str`, *optional*, defaults to `"[PAD]"`): + The token used for padding, for example when batching sequences of different lengths. + cls_token (`str`, *optional*, defaults to `"[CLS]"`): + The classifier token which is used when doing sequence classification (classification of the whole sequence + instead of per-token classification). It is the first token of the sequence when built with special tokens. + mask_token (`str`, *optional*, defaults to `"[MASK]"`): + The token used for masking values. This is the token used when training this model with masked language + modeling. This is the token which the model will try to predict. + tokenize_chinese_chars (`bool`, *optional*, defaults to `True`): + Whether or not to tokenize Chinese characters. + This should likely be deactivated for Japanese (see this + [issue](https://github.com/huggingface/transformers/issues/328)). + strip_accents (`bool`, *optional*): + Whether or not to strip all accents. If this option is not specified, then it will be determined by the + value for `lowercase` (as in the original BERT). 
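+        Example (illustrative local vocab path):
+            >>> tokenizer = PegasusTokenizer("path/to/vocab.txt")
+            >>> tokenizer.tokenize("祝贺谷爱凌夺得银牌")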
+ """ + + vocab_files_names = VOCAB_FILES_NAMES + model_input_names = ["input_ids", "attention_mask"] + + # pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP + # pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION + # max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES + + def __init__(self, + vocab_file, + do_lower_case=True, + do_basic_tokenize=True, + never_split=None, + pad_token="", + eos_token="", + unk_token="", + mask_token="", + mask_token_sent="", + additional_special_tokens=None, + sep_token="[SEP]", + cls_token="[CLS]", + tokenize_chinese_chars=True, + strip_accents=None, + offset=100, + pre_tokenizer=lambda x: jieba.cut(x, HMM=False), + **kwargs): + self.offset = offset + + if additional_special_tokens is not None: + if not isinstance(additional_special_tokens, list): + raise TypeError( + f"additional_special_tokens should be of type {type(list)}, \ + but is {type(additional_special_tokens)}" + ) + + additional_special_tokens_extended = ( + ([mask_token_sent] + additional_special_tokens) + if mask_token_sent not in additional_special_tokens + and mask_token_sent is not None else additional_special_tokens) + + # fill additional tokens with ..., in case not all additional tokens are already taken + additional_special_tokens_extended += [ + f"" for i in range( + len(additional_special_tokens_extended), self.offset - 1) + ] + + if len(set(additional_special_tokens_extended)) != len( + additional_special_tokens_extended): + raise ValueError( + f"Please make sure that the provided additional_special_tokens \ + do not contain an incorrectly shifted list of tokens. \ + Found {additional_special_tokens_extended}." + ) + additional_special_tokens = additional_special_tokens_extended + else: + additional_special_tokens = [ + mask_token_sent + ] if mask_token_sent is not None else [] + # additional_special_tokens += [f"" for i in range(3, self.offset)] + + # print("additional_special_tokens: ", additional_special_tokens) + + if not os.path.isfile(vocab_file): + raise ValueError( + f"Can't find a vocabulary file at path '{vocab_file}'. 
\ + To load the vocabulary from a Google pretrained " + "model use `tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`" + ) + + super().__init__( + do_lower_case=do_lower_case, + do_basic_tokenize=do_basic_tokenize, + never_split=never_split, + unk_token=unk_token, + sep_token=sep_token, + pad_token=pad_token, + cls_token=cls_token, + mask_token=mask_token, + eos_token=eos_token, + tokenize_chinese_chars=tokenize_chinese_chars, + additional_special_tokens=additional_special_tokens, + strip_accents=strip_accents, + **kwargs, + ) + + self.pre_tokenizer = pre_tokenizer + self.mask_token_sent = mask_token_sent + self.vocab = load_vocab(vocab_file) + + self.vocab[self.eos_token] = self.vocab.pop("[unused1]") + # self.vocab[self.eos_token] = self.vocab.pop("[unused2]") + self.vocab[self.pad_token] = self.vocab.pop("[PAD]") + self.vocab[self.unk_token] = self.vocab.pop("[UNK]") + + if self.mask_token_sent is not None: + self.vocab[self.mask_token] = self.vocab.pop("[unused3]") + self.vocab[self.mask_token_sent] = self.vocab.pop("[unused2]") + + self.ids_to_tokens = collections.OrderedDict([ + (ids, tok) for tok, ids in self.vocab.items() + ]) + self.do_basic_tokenize = do_basic_tokenize + if do_basic_tokenize: + self.basic_tokenizer = BasicTokenizer( + do_lower_case=do_lower_case, + never_split=never_split, + tokenize_chinese_chars=tokenize_chinese_chars, + strip_accents=strip_accents, + ) + self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab, + unk_token=self.unk_token) + + @property + def do_lower_case(self): + return self.basic_tokenizer.do_lower_case + + @property + def vocab_size(self): + return len(self.vocab) + + def get_vocab(self): + return dict(self.vocab, **self.added_tokens_encoder) + + def _tokenize(self, text): + split_tokens = [] + # print("pegasus_tokenizer: ", text) + for text in self.pre_tokenizer(text): + if text in self.vocab: + split_tokens.append(text) + else: + if self.do_basic_tokenize: + for token in self.basic_tokenizer.tokenize( + text, never_split=self.all_special_tokens): + + # If the token is part of the never_split set + if token in self.basic_tokenizer.never_split: + split_tokens.append(token) + else: + split_tokens += self.wordpiece_tokenizer.tokenize( + token) + else: + split_tokens = self.wordpiece_tokenizer.tokenize(text) + return split_tokens + + def _convert_token_to_id(self, token): + """Converts a token (str) in an id using the vocab.""" + return self.vocab.get(token, self.vocab.get(self.unk_token)) + + def _convert_id_to_token(self, index): + """Converts an index (integer) in a token (str) using the vocab.""" + return self.ids_to_tokens.get(index, self.unk_token) + + @staticmethod + def _cjk_punctuation(): + return u'\uff02\uff03\uff04\uff05\uff06\uff07\uff08\uff09\uff0a\uff0b\uff0c\uff0d\uff0f\uff1a\uff1b\uff1c\uff1d\ + \uff1e\uff20\uff3b\uff3c\uff3d\uff3e\uff3f\uff40\uff5b\uff5c\uff5d\uff5e\uff5f\uff60\uff62\ + \uff63\uff64\u3000\u3001\u3003\u3008\u3009\u300a\u300b\u300c\u300d\u300e\u300f\u3010\u3011\u3014\ + \u3015\u3016\u3017\u3018\u3019\u301a\u301b\u301c\u301d\u301e\u301f\u3030\u303e\u303f\u2013\u2014\ + \u2018\u2019\u201b\u201c\u201d\u201e\u201f\u2026\u2027\ufe4f\ufe51\ufe54\u00b7\uff01\uff1f\uff61\u3002' + + def convert_ids_to_tokens( + self, + ids: Union[int, List[int]], + skip_special_tokens: bool = False) -> Union[str, List[str]]: + """ + Converts a single index or a sequence of indices in a token or a sequence of tokens, using the vocabulary and + added tokens. 
+ Args: + ids (`int` or `List[int]`): + The token id (or token ids) to convert to tokens. + skip_special_tokens (`bool`, *optional*, defaults to `False`): + Whether or not to remove special tokens in the decoding. + Returns: + `str` or `List[str]`: The decoded token(s). + """ + if isinstance(ids, int): + if ids in self.added_tokens_decoder: + return self.added_tokens_decoder[ids] + else: + return self._convert_id_to_token(ids) + tokens = [] + for index in ids: + index = int(index) + if skip_special_tokens and index in self.all_special_ids and index != 2: + continue + if index in self.added_tokens_decoder: + tokens.append(self.added_tokens_decoder[index]) + else: + tokens.append(self._convert_id_to_token(index)) + return tokens + + def convert_tokens_to_string(self, tokens): + """Converts a sequence of tokens (string) in a single string.""" + # for token in + # tokens = tokens or self.ids_to_tokens(ids) + # tokens = [token for token in tokens if not self._is_special(token)] + + text = '' + for i, token in enumerate(tokens): + if token[:2] == '##': + text += token[2:] + elif len(token) == 1 and _is_chinese_char(ord(token)): + text += token + elif len(token) == 1 and _is_punctuation(token): + text += token + text += ' ' + elif i > 0 and _is_chinese_char(ord(text[-1])): + text += token + elif tokens == "": + continue + else: + text += ' ' + text += token + + text = re.sub(' +', ' ', text) + text = re.sub('\' (re|m|s|t|ve|d|ll) ', '\'\\1 ', text) + punctuation = re.sub(' +', '', self._cjk_punctuation()).strip() + '+-/={(<[' + punctuation_regex = '|'.join([re.escape(p) for p in punctuation]) + punctuation_regex = '(%s) ' % punctuation_regex + text = re.sub(punctuation_regex, '\\1', text) + text = re.sub(r'(\d\.) (\d)', '\\1\\2', text) + + return text.strip() + # out_string = " ".join(tokens).replace(" ##", "").strip() + + def build_inputs_with_special_tokens( + self, + token_ids_0: List[int], + token_ids_1: Optional[List[int]] = None) -> List[int]: + """ + Build model inputs from a sequence or a pair of sequences for sequence classification tasks by concatenating + and adding special tokens. A PEGASUS sequence has the following format, where `X` represents the sequence: + - single sequence: `X ` + - pair of sequences: `A B ` (not intended use) + BOS is never used. Pairs of sequences are not the expected use case, but they will be handled without a + separator. + Args: + token_ids_0 (`List[int]`): + List of IDs to which the special tokens will be added. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + Returns: + `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens. + """ + if token_ids_1 is None: + return token_ids_0 + [self.eos_token_id] + return token_ids_0 + token_ids_1 + [self.eos_token_id] + + def _special_token_mask(self, seq): + all_special_ids = set( + self.all_special_ids) # call it once instead of inside list comp + # all_special_ids.remove(self.unk_token_id) # is only sometimes special + + return [1 if x in all_special_ids else 0 for x in seq] + + def get_special_tokens_mask( + self, + token_ids_0: List[int], + token_ids_1: Optional[List[int]] = None, + already_has_special_tokens: bool = False) -> List[int]: + """ + Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding + special tokens using the tokenizer `prepare_for_model` method. + Args: + token_ids_0 (`List[int]`): + List of IDs. 
+ token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + already_has_special_tokens (`bool`, *optional*, defaults to `False`): + Whether or not the token list is already formatted with special tokens for the model. + Returns: + `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. + """ + + if already_has_special_tokens: + return self._special_token_mask(token_ids_0) + elif token_ids_1 is None: + return self._special_token_mask(token_ids_0) + [self.eos_token_id] + else: + return self._special_token_mask(token_ids_0 + + token_ids_1) + [self.eos_token_id] + + def num_special_tokens_to_add(self, pair=False): + """Just EOS""" + return 1 + + def save_vocabulary(self, + save_directory: str, + filename_prefix: Optional[str] = None) -> Tuple[str]: + index = 0 + if os.path.isdir(save_directory): + vocab_file = os.path.join( + save_directory, + (filename_prefix + "-" if filename_prefix else "") + + VOCAB_FILES_NAMES["vocab_file"]) + else: + vocab_file = (filename_prefix + + "-" if filename_prefix else "") + save_directory + with open(vocab_file, "w", encoding="utf-8") as writer: + for token, token_index in sorted(self.vocab.items(), + key=lambda kv: kv[1]): + if index != token_index: + logger.warning( + f"Saving vocabulary to {vocab_file}: vocabulary indices are not consecutive." + " Please check that the vocabulary is not corrupted!") + index = token_index + writer.write(token + "\n") + index += 1 + return (vocab_file, ) + + +class BasicTokenizer(object): + """ + Constructs a BasicTokenizer that will run basic tokenization (punctuation splitting, lower casing, etc.). + Args: + do_lower_case (`bool`, *optional*, defaults to `True`): + Whether or not to lowercase the input when tokenizing. + never_split (`Iterable`, *optional*): + Collection of tokens which will never be split during tokenization. Only has an effect when + `do_basic_tokenize=True` + tokenize_chinese_chars (`bool`, *optional*, defaults to `True`): + Whether or not to tokenize Chinese characters. + This should likely be deactivated for Japanese (see this + [issue](https://github.com/huggingface/transformers/issues/328)). + strip_accents: (`bool`, *optional*): + Whether or not to strip all accents. If this option is not specified, then it will be determined by the + value for `lowercase` (as in the original BERT). + """ + + def __init__(self, + do_lower_case=True, + never_split=None, + tokenize_chinese_chars=True, + strip_accents=None): + if never_split is None: + never_split = [] + self.do_lower_case = do_lower_case + self.never_split = set(never_split) + self.tokenize_chinese_chars = tokenize_chinese_chars + self.strip_accents = strip_accents + + def tokenize(self, text, never_split=None): + """ + Basic Tokenization of a piece of text. Split on "white spaces" only, for sub-word tokenization, see + WordPieceTokenizer. + Args: + never_split (`List[str]`, *optional*) + Kept for backward compatibility purposes. Now implemented directly at the base class level (see + [`PreTrainedTokenizer.tokenize`]) List of token not to split. + """ + # union() returns a new set by concatenating the two sets. + never_split = self.never_split.union( + set(never_split)) if never_split else self.never_split + text = self._clean_text(text) + + # This was added on November 1st, 2018 for the multilingual and Chinese + # models. 
This is also applied to the English models now, but it doesn't + # matter since the English models were not trained on any Chinese data + # and generally don't have any Chinese data in them (there are Chinese + # characters in the vocabulary because Wikipedia does have some Chinese + # words in the English Wikipedia.). + if self.tokenize_chinese_chars: + text = self._tokenize_chinese_chars(text) + orig_tokens = whitespace_tokenize(text) + split_tokens = [] + for token in orig_tokens: + if token not in never_split: + if self.do_lower_case: + token = token.lower() + if self.strip_accents is not False: + token = self._run_strip_accents(token) + elif self.strip_accents: + token = self._run_strip_accents(token) + split_tokens.extend(self._run_split_on_punc(token, never_split)) + + output_tokens = whitespace_tokenize(" ".join(split_tokens)) + return output_tokens + + def _run_strip_accents(self, text): + """Strips accents from a piece of text.""" + text = unicodedata.normalize("NFD", text) + output = [] + for char in text: + cat = unicodedata.category(char) + if cat == "Mn": + continue + output.append(char) + return "".join(output) + + def _run_split_on_punc(self, text, never_split=None): + """Splits punctuation on a piece of text.""" + if never_split is not None and text in never_split: + return [text] + chars = list(text) + i = 0 + start_new_word = True + output = [] + while i < len(chars): + char = chars[i] + if _is_punctuation(char): + output.append([char]) + start_new_word = True + else: + if start_new_word: + output.append([]) + start_new_word = False + output[-1].append(char) + i += 1 + + return ["".join(x) for x in output] + + def _tokenize_chinese_chars(self, text): + """Adds whitespace around any CJK character.""" + output = [] + for char in text: + cp = ord(char) + if self._is_chinese_char(cp): + output.append(" ") + output.append(char) + output.append(" ") + else: + output.append(char) + return "".join(output) + + def _is_chinese_char(self, cp): + """Checks whether CP is the codepoint of a CJK character.""" + # This defines a "chinese character" as anything in the CJK Unicode block: + # https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block) + # + # Note that the CJK Unicode block is NOT all Japanese and Korean characters, + # despite its name. The modern Korean Hangul alphabet is a different block, + # as is Japanese Hiragana and Katakana. Those alphabets are used to write + # space-separated words, so they are not treated specially and handled + # like the all of the other languages. 
+ if ((cp >= 0x4E00 and cp <= 0x9FFF) + or (cp >= 0x3400 and cp <= 0x4DBF) # + or (cp >= 0x20000 and cp <= 0x2A6DF) # + or (cp >= 0x2A700 and cp <= 0x2B73F) # + or (cp >= 0x2B740 and cp <= 0x2B81F) # + or (cp >= 0x2B820 and cp <= 0x2CEAF) # + or (cp >= 0xF900 and cp <= 0xFAFF) + or (cp >= 0x2F800 and cp <= 0x2FA1F)): # + return True + + return False + + def _clean_text(self, text): + """Performs invalid character removal and whitespace cleanup on text.""" + output = [] + for char in text: + cp = ord(char) + if cp == 0 or cp == 0xFFFD or _is_control(char): + continue + if _is_whitespace(char): + output.append(" ") + else: + output.append(char) + return "".join(output) + + +class WordpieceTokenizer(object): + """Runs WordPiece tokenization.""" + + def __init__(self, vocab, unk_token, max_input_chars_per_word=100): + self.vocab = vocab + self.unk_token = unk_token + self.max_input_chars_per_word = max_input_chars_per_word + + def tokenize(self, text): + """ + Tokenizes a piece of text into its word pieces. This uses a greedy longest-match-first algorithm to perform + tokenization using the given vocabulary. + For example, `input = "unaffable"` wil return as output `["un", "##aff", "##able"]`. + Args: + text: A single token or whitespace separated tokens. This should have + already been passed through *BasicTokenizer*. + Returns: + A list of wordpiece tokens. + """ + + output_tokens = [] + for token in whitespace_tokenize(text): + chars = list(token) + if len(chars) > self.max_input_chars_per_word: + output_tokens.append(self.unk_token) + continue + + is_bad = False + start = 0 + sub_tokens = [] + while start < len(chars): + end = len(chars) + cur_substr = None + while start < end: + substr = "".join(chars[start:end]) + if start > 0: + substr = "##" + substr + if substr in self.vocab: + cur_substr = substr + break + end -= 1 + if cur_substr is None: + is_bad = True + break + sub_tokens.append(cur_substr) + start = end + + if is_bad: + output_tokens.append(self.unk_token) + else: + output_tokens.extend(sub_tokens) + return output_tokens diff --git a/fengshen/examples/pretrain_bert/README.md b/fengshen/examples/pretrain_bert/README.md new file mode 100644 index 0000000000000000000000000000000000000000..1761095920188083853fb3df47927f0f9c008b76 --- /dev/null +++ b/fengshen/examples/pretrain_bert/README.md @@ -0,0 +1,78 @@ +# Bert预训练 + +## 背景 + +我们有持续收集了一部分语料,有一套自建的数据处理流程。位了验证数据处理的效果,从零开始预训练了2个base级别的Bert模型,一个是基于自建数据,一个是基于同行们开源的数据。总体来说数据效果差别不大,下面只介绍一下本次预训练的流程。 + +## 数据处理 + +我们的原始语料主要源自common crawl以及一些开源的高质量语料,经过一些列的数据清洗之后,我们的数据格式为jsonline。例如(摘自内部数据): +```json +{"text":"据悉,河南博物馆成立于1927年,目前拥有超过170000件(套)的文物收藏,包括Jiahu骨笛,雌性猫头鹰雕像,cloud-patterned铜禁,Duling Fangding,莲花和起重机广场,和玉柄剑,黄金从武则天滑落,四神云雾壁画和汝窑天蓝釉雕鹅颈瓶是九大镇厅的珍品。院中的藏品以史前文物、商周青铜器、陶瓷、玉器和石雕等为特色。高质量文物数量多、品种齐全、品位高、价值高。它们是见证中国文明发展、展示中国历史发展的文化艺术宝库。"} +{"text": "功夫不负有心人,1925年,万氏兄弟试制动画片初获成果,并获得了商务印书馆的大力支持。其后兄弟们再接再厉,直到1927年,一部黑白无声动画片《大闹画室》诞生了爱尔兰风笛。据《申报》记载,“该片内容画人与真人合作锁梦楼,滑稽处甚多,令人观后,捧腹不止。”此片曾远销美国放映,并大受赞誉。1930年夏俊娜,万古蟾到大中华百合影片公司工作,万氏兄弟采用了同样的手法拍摄了第二部动画短片《纸人捣乱记》,并于1931年上映。"} +``` + +处理脚本路径:`/cognitive_comp/wuziwei/codes/Fengshenbang-LM/fengshen/data/bert_dataloader` + +该路径下面有3个文件,`auto_split.sh`和`preprocessing.py`是原始数据预处理的脚本,`load.py是fs_data`的处理脚本,执行顺序如下: + +#### step 1 + +执行`auto_split.sh`文件,作用是分割大文件,超过1GB的文件,会自动分割未300M的小文件。使用方法如下: + +`sh auto_split.sh 你的数据文件路径` + +#### step 2 + +执行`preprocessing.py`文件,该文件的作用主要是分句,为什么不嵌入到collate_fn中做,是发现那样效率会慢一些,所以单独拿出来做了。 +执行`python preprocessing.py`即可,注意修改脚本内的文件路径。 + +#### step 3 + 
+`load.py`文件是用fsdata的方式加载数据集,也是执行即可。执行一遍,后续的加载可以实现180GB的数据秒入~ + +前面两步是为了提高load.py文件生成缓存文件的速度。经过这几步的处理以及collate_fn函数(bert mask 策略的实现),最终变成bert的输入。如下: + +*ps: collate_fn在`Fengshenbang-LM\fengshen\examples\pretrain_bert\pretrain_bert.py`脚本下,由DataCollate类实现。* + +```json +{ +"input_ids": torch.tensor(input_ids), +"labels": torch.tensor(batch_labels), +"attention_mask": torch.tensor(attention_mask), +"token_type_ids": torch.tensor(token_type_ids) +} +``` + +## 模型结构 + +模型结构即为标准的bert-base,即: +| 配置 | 参数 | +| :---------: | :---: | +| nlayers | 12 | +| nheaders | 12 | +| hidden-size | 768 | +| seq-length | 512 | +| vocab-size | 21128 | + +## 任务以及Mask策略 + +*mask策略的实现在`Fengshenbang-LM\fengshen\examples\pretrain_bert\pretrain_bert.py`的**DataCollate**类中* + +本次预训练取消了NSP任务,只做mask任务,具体mask策略如下: + +- 15%随机mask + - 80% mask + - 10% 随机替换 + - 10% 保持不变 +- 全词mask (wwm) +- n-gram mask + +由于加入了全词mask和n-gram mask 总体的mask token数量会比英文原始论文的mask比例略高 + +## 预训练执行流程 + +- 训练框架:[Fengshenbang-LM](https://github.com/IDEA-CCNL/Fengshenbang-LM) +- 脚本执行:`sh Fengshenbang-LM\fengshen\examples\pretrain_bert\pretrain_bert.sh` + +*具体配置见`Fengshenbang-LM\fengshen\examples\pretrain_bert\pretrain_bert.sh`* diff --git a/fengshen/examples/pretrain_bert/pretrain_bert.py b/fengshen/examples/pretrain_bert/pretrain_bert.py new file mode 100644 index 0000000000000000000000000000000000000000..a07d7020e10503c4a2b15cfa8456de3264bd13f4 --- /dev/null +++ b/fengshen/examples/pretrain_bert/pretrain_bert.py @@ -0,0 +1,278 @@ +from data.bert_dataloader.load import BertDataModule +from transformers import ( + BertTokenizer, + BertConfig, + BertForPreTraining, + BertModel, + BertForMaskedLM +) +from pytorch_lightning import ( + LightningDataModule, + LightningModule, + loggers, + Trainer, +) +from pytorch_lightning.callbacks import ( + ModelCheckpoint, + LearningRateMonitor, +) +from typing import Optional +from torch.utils.data import DataLoader +from transformers.optimization import get_linear_schedule_with_warmup +import argparse +import sys +import torch +import os +import re +import jieba +import numpy as np + +# 如果没有安装fengshen模块,请把Fengshenbang-LM/fengshen加入到系统环境变量 +sys.path.insert(0, '../../../fengshen') + +os.environ["CUDA_VISIBLE_DEVICES"] = '0,1' + + +class DataCollate(object): + + def __init__(self, tokenizer, max_length, mask_rate=0.15, max_ngram=3, if_padding=True) -> None: + self.tokenizer = tokenizer + self.max_length = max_length + self.word_cuter = jieba.cut + self.vocab_length = len(tokenizer) + self.mask_rate = mask_rate + self.ignore_labels = -100 + self.ngrams = np.arange(1, max_ngram + 1, dtype=np.int64) + pvals = 1. 
/ np.arange(1, max_ngram + 1) + pvals /= pvals.sum(keepdims=True) # p(n) = 1/n / sigma(1/k) + self.pvals = pvals + self.padding = if_padding + + def token_process(self, token_id): + rand = np.random.random() + if rand <= 0.8: + return self.tokenizer.mask_token_id + elif rand <= 0.9: + return token_id + else: + return np.random.randint(1, self.vocab_length) + + def __call__(self, samples): + input_ids = [] + attention_mask = [] + token_type_ids = [] + batch_labels = [] + # print('^-^ batch size :',len(samples)) + for sample in samples: + word_list = list(self.word_cuter(sample['text'])) + mask_ids, labels = [], [] + + record = [] + for i in range(len(word_list)): + rands = np.random.random() + if i in record: + continue + word = word_list[i] + if rands > self.mask_rate and len(word) < 4: + word = word_list[i] + word_encode = tokenizer.encode(word, add_special_tokens=False) + for token in word_encode: + mask_ids.append(token) + labels.append(self.ignore_labels) + record.append(i) + else: + n = np.random.choice(self.ngrams, p=self.pvals) + for index in range(n): + ind = index + i + if ind in record or ind >= len(word_list): + continue + record.append(ind) + word = word_list[ind] + word_encode = tokenizer.encode(word, add_special_tokens=False) + for token in word_encode: + mask_ids.append(self.token_process(token)) + labels.append(token) + if self.padding: + if len(mask_ids) > self.max_length: + input_ids.append(mask_ids[:self.max_length]) + batch_labels.append(labels[:self.max_length]) + else: + lenght = len(mask_ids) + mask_ids.extend([0]*(self.max_length-lenght)) + labels.extend([-100]*(self.max_length-lenght)) + input_ids.append(mask_ids) + batch_labels.append(labels) + attention_mask.append([1]*self.max_length) + token_type_ids.append([0]*self.max_length) + + # print('sentence:',sample['text']) + # print('input_ids:',mask_ids) + # print('decode inputids:',self.tokenizer.decode(mask_ids)) + # print('labels',labels) + # print('decode labels:',self.tokenizer.decode(labels)) + # print('*'*20) + return { + 'input_ids': torch.tensor(input_ids), + 'labels': torch.tensor(batch_labels), + 'attention_mask': torch.tensor(attention_mask), + 'token_type_ids': torch.tensor(token_type_ids) + } + + +class Bert(LightningModule): + @staticmethod + def add_module_specific_args(args_parser): + parser = args_parser.add_argument_group('Bert') + parser.add_argument('--model_path', type=str, default='') + parser.add_argument('--learning_rate', default=1e-5, type=float) + parser.add_argument('--weight_decay', default=0.1, type=float) + parser.add_argument('--warmup', default=0.01, type=float) + return args_parser + + def __init__(self, args): + super().__init__() + self.save_hyperparameters(args) + self.bertconfig = BertConfig.from_pretrained(args.model_path) + # self.model = BertForPreTraining(self.bertconfig) + self.model = BertForMaskedLM(self.bertconfig) + + def setup(self, stage) -> None: + if stage == 'fit': + train_loader = self.trainer._data_connector._train_dataloader_source.dataloader() + + # Calculate total steps + tb_size = self.hparams.train_batchsize * max(1, self.trainer.gpus) + ab_size = self.trainer.accumulate_grad_batches * float(self.trainer.max_epochs) + self.total_steps = (len(train_loader.dataset) // tb_size) // ab_size + + def configure_optimizers(self): + + no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] + paras = list( + filter(lambda p: p[1].requires_grad, self.named_parameters())) + paras = [{ + 'params': + [p for n, p in paras if not any(nd in n for nd in no_decay)], + 
'weight_decay': self.hparams.weight_decay + }, { + 'params': [p for n, p in paras if any(nd in n for nd in no_decay)], + 'weight_decay': 0.0 + }] + optimizer = torch.optim.AdamW(paras, lr=self.hparams.learning_rate) + scheduler = get_linear_schedule_with_warmup( + optimizer, int(self.total_steps * self.hparams.warmup), + self.total_steps) + + return [{ + 'optimizer': optimizer, + 'lr_scheduler': { + 'scheduler': scheduler, + 'interval': 'step', + 'frequency': 1 + } + }] + + def training_step(self, batch, batch_idx): + output = self.model(**batch) + # print(output) + self.log('train_loss', output.loss) + return output.loss + + def comput_metrix(self, logits, labels): + ones = torch.ones_like(labels) + zero = torch.zeros_like(labels) + mask = torch.where(labels < 0, zero, ones) + mask = mask.view(size=(-1,)).float() + # y_true=labels.view(size=(-1,)).float() + + y_pred = torch.argmax(logits, dim=-1) + y_pred = y_pred.view(size=(-1,)) + y_true = labels.view(size=(-1,)).float() + corr = torch.eq(y_pred, y_true) + corr = torch.multiply(corr.float(), mask) + acc = torch.sum(corr.float()) / torch.sum(mask) + return acc + + def validation_step(self, batch, batch_idx): + output = self.model(**batch) + # print(output) + acc = self.comput_metrix(output.logits, batch['labels']) + print('val_loss ', output.loss) + self.log('val_loss', output.loss) + self.log('val_acc', acc) + # pass + + def predict_step(self, batch, batch_idx): + output = self.model(**batch) + return output.prediction_logits + + +class CustomCKPT: + @staticmethod + def add_argparse_args(parent_args): + parser = parent_args.add_argument_group('ckpt call back') + + parser.add_argument('--monitor', default='train_loss', type=str) + parser.add_argument('--mode', default='min', type=str) + parser.add_argument('--dirpath', default='./ckpt/', type=str) + parser.add_argument( + '--filename', default='model-{epoch:02d}-{train_loss:.4f}', type=str) + parser.add_argument('--save_last', action='store_true', default=True) + parser.add_argument('--save_top_k', default=3, type=float) + parser.add_argument('--every_n_train_steps', default=100, type=float) + parser.add_argument('--save_weights_only', action='store_true', default=False) + + return parent_args + + def __init__(self, args): + self.callbacks = ModelCheckpoint(monitor=args.monitor, + save_top_k=args.save_top_k, + mode=args.mode, + every_n_train_steps=args.every_n_train_steps, + save_weights_only=args.save_weights_only, + dirpath=args.dirpath, + filename=args.filename, + save_last=args.save_last) + + +if __name__ == '__main__': + args_parser = argparse.ArgumentParser() + args_parser = BertDataModule.add_data_specific_args(args_parser) + args_parser = Trainer.add_argparse_args(args_parser) + args_parser = Bert.add_module_specific_args(args_parser) + args_parser = CustomCKPT.add_argparse_args(args_parser) + args_parser.add_argument('--deepspeed') + args_parser.add_argument('--seq_max_length') + + args = args_parser.parse_args() + + tokenizer = BertTokenizer.from_pretrained(args.model_path) + collate_fn = DataCollate(tokenizer, 512) + data_module = BertDataModule(tokenizer=tokenizer, args=args, collate_fn=collate_fn) + + print('data load complete') + + model = Bert(args) + print('model load complete') + + lr_monitor = LearningRateMonitor(logging_interval='step') + logger = loggers.TensorBoardLogger(save_dir=os.path.join( + args.default_root_dir, 'logs/'), + name=os.path.basename(os.path.dirname(args.model_path))) + checkpoint_callback = CustomCKPT(args).callbacks + + if 
args.resume_from_checkpoint is not None and \ + not os.path.exists(args.resume_from_checkpoint): + print('--------warning no checkpoint found--------, remove args') + del args.resume_from_checkpoint + + # autotuning + if args.deepspeed is not None: + os.environ['PL_DEEPSPEED_CONFIG_PATH'] = args.deepspeed + + trainer = Trainer.from_argparse_args(args, logger=logger, + callbacks=[ + lr_monitor, + checkpoint_callback]) + + trainer.fit(model, data_module) diff --git a/fengshen/examples/pretrain_bert/pretrain_bert.sh b/fengshen/examples/pretrain_bert/pretrain_bert.sh new file mode 100644 index 0000000000000000000000000000000000000000..f6e6453826d1c6408de4a7e064a7756529b0c6cd --- /dev/null +++ b/fengshen/examples/pretrain_bert/pretrain_bert.sh @@ -0,0 +1,116 @@ +#!/bin/bash +#SBATCH --job-name=pretrain_bart # create a short name for your job +#SBATCH --nodes=1 # node count +#SBATCH --ntasks-per-node=8 # number of tasks to run per node +#SBATCH --cpus-per-task=30 # cpu-cores per task (>1 if multi-threaded tasks) +#SBATCH --gres=gpu:8 # number of gpus per node +#SBATCH -o %x-%j.log # output and error log file names (%x for job id) +#SBATCH -x dgx050 + + +MODEL_NAME=bert-1.3B + +config_json="./$MODEL_NAME.ds_config.json" +((MASTER_PORT=$RANDOM%10000+40000)) +echo $MASTER_PORT +ZERO_STAGE=2 +MICRO_BATCH_SIZE=16 + +# Deepspeed figures out GAS dynamically from dynamic GBS via set_train_batch_size() +cat < $config_json +{ + "zero_optimization": { + "stage": $ZERO_STAGE, + "contiguous_gradients": true, + "overlap_comm": true, + "reduce_scatter": true, + "reduce_bucket_size": 2e8, + "allgather_bucket_size": 2e8 + }, + "fp16": { + "enabled": true, + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 16, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "optimizer": { + "params": { + "betas": [ + 0.9, + 0.999 + ], + "eps": 1e-08, + "lr": 1e-04, + "weight_decay": 0.01 + }, + "type": "Adam" + }, + "scheduler": { + "params": { + "warmup_max_lr": 1e-04, + "warmup_min_lr": 1e-05, + "total_num_steps": 536877, + "warmup_num_steps" : 50000 + }, + "type": "WarmupDecayLR" + }, + "steps_per_print": 100, + "gradient_clipping": 1, + "train_micro_batch_size_per_gpu": $MICRO_BATCH_SIZE, + "zero_allow_untested_optimizer": false +} +EOT + +export PL_DEEPSPEED_CONFIG_PATH=$config_json +export TORCH_EXTENSIONS_DIR=/home/wuziwei/torch_extendsions + +DATA_ARGS="\ + --datasets_name wudao_180g \ + --num_workers 16 \ + --train_batchsize $MICRO_BATCH_SIZE + " + +MODEL_ARGS="\ + --model_path /data0/wuziwei/codes/Fengshenbang-LM/fengshen/examples/pretrain_bert/wudao180g_bert_base \ + --learning_rate 1e-5 \ + --weight_decay 0.01 \ + --warmup 0.001 \ + " + +MODEL_CHECKPOINT_ARGS="\ + --monitor train_loss \ + --save_top_k 3 \ + --mode min \ + --save_last \ + --every_n_train_steps 5000 \ + --dirpath /data0/wuziwei/codes/Fengshenbang-LM/fengshen/examples/pretrain_bert/$MODEL_NAME \ + --filename model-{step:02d}-{train_loss:.4f} \ + " +TRAINER_ARGS="\ + --gradient_clip_val 1.0 \ + --max_epochs 1 \ + --gpus 2 \ + --num_nodes 1 \ + --strategy ddp \ + --log_every_n_steps 100 \ + --val_check_interval 0.1 \ + --check_val_every_n_epoch 1 \ + --accumulate_grad_batches 1 \ + --resume_from_checkpoint /data0/wuziwei/codes/Fengshenbang-LM/fengshen/examples/pretrain_bert/$MODEL_NAME/last.ckpt \ + --default_root_dir /data0/wuziwei/codes/Fengshenbang-LM/fengshen/examples/pretrain_bert/$MODEL_NAME \ + " + + +export options=" \ + $DATA_ARGS \ + $MODEL_ARGS \ + $MODEL_CHECKPOINT_ARGS \ + $TRAINER_ARGS \ + " + +export 
SCRIPT_PATH=/data0/wuziwei/codes/Fengshenbang-LM/fengshen/examples/pretrain_bert/pretrain_bert.py + +bash -c 'python3 $SCRIPT_PATH $options' + diff --git a/fengshen/examples/pretrain_erlangshen_bert/pretrain_erlangshen.py b/fengshen/examples/pretrain_erlangshen_bert/pretrain_erlangshen.py new file mode 100644 index 0000000000000000000000000000000000000000..1487abb15a7419b6c00056b6fcd78e96c8125d8b --- /dev/null +++ b/fengshen/examples/pretrain_erlangshen_bert/pretrain_erlangshen.py @@ -0,0 +1,237 @@ +from dataclasses import dataclass +from transformers import ( + MegatronBertConfig, + MegatronBertForPreTraining, + AutoTokenizer, +) +from pytorch_lightning import ( + LightningModule, + Trainer, +) +from pytorch_lightning.callbacks import ( + LearningRateMonitor, +) +import argparse +import torch +import os +import numpy as np +import time +from fengshen.data.universal_datamodule import UniversalDataModule +from fengshen.data.data_utils.sop_utils import get_a_and_b_segments +from fengshen.data.data_utils.truncate_utils import truncate_segments +from fengshen.data.data_utils.token_type_utils import create_tokens_and_tokentypes +from fengshen.data.data_utils.mask_utils import create_masked_lm_predictions +from fengshen.models.model_utils import ( + add_module_args, + configure_optimizers, + get_total_steps, +) +from fengshen.utils.universal_checkpoint import UniversalCheckpoint +from torch.utils.data._utils.collate import default_collate + +SHOW_DATA = False + + +@dataclass +class ErLangShenCollator: + ''' + 由input处理成samples,也就是最终模型的输入 + 其中主要处理逻辑在__call__里 + 包含Mask和Sop任务 + ''' + tokenizer: None # 分词 + max_seq_length: 512 + masked_lm_prob: 0.15 + content_key: str = 'text' + # 一些预处理操作 + + def setup(self): + from fengshen.data.data_utils.sentence_split import ChineseSentenceSplitter + self.sentence_split = ChineseSentenceSplitter() + self.np_rng = np.random.RandomState(seed=((int(time.time()) % 2**32))) + inv_vocab = {v: k for k, v in self.tokenizer.vocab.items()} + self.vocab_id_list = list(inv_vocab.keys()) + self.vocab_id_to_token_dict = inv_vocab + + def __call__(self, samples): + ''' + samples: 一个sample长这样{"text": "hello world"} + ''' + model_inputs = [] + for s in samples: + sentences = self.sentence_split.tokenize(s[self.content_key]) + # Divide sample into two segments (A and B). + tokenized_sentences = [self.tokenizer.convert_tokens_to_ids( + self.tokenizer.tokenize(sent)) for sent in sentences] + if len(tokenized_sentences) == 0: + print('find empty sentence') + continue + if len(tokenized_sentences) > 1: + tokens_a, tokens_b, is_next_random = get_a_and_b_segments(tokenized_sentences, + self.np_rng) + else: + tokens_a = tokenized_sentences[0] + tokens_b = [] + is_next_random = False + # max_seq_length - 3因为还需要拼上[CLS] [SEP] [SEP] + if len(tokens_a) == 0: + continue + _ = truncate_segments(tokens_a, tokens_b, len(tokens_a), + len(tokens_b), self.max_seq_length-3, self.np_rng) + # Build tokens and toketypes. + tokens, tokentypes = create_tokens_and_tokentypes(tokens_a, tokens_b, + self.tokenizer.cls_token_id, self.tokenizer.sep_token_id) + # Masking. + max_predictions_per_seq = self.masked_lm_prob * len(tokens) + (tokens, masked_positions, masked_labels, _, _) = create_masked_lm_predictions( + tokens, self.vocab_id_list, self.vocab_id_to_token_dict, self.masked_lm_prob, + self.tokenizer.cls_token_id, self.tokenizer.sep_token_id, self.tokenizer.mask_token_id, + max_predictions_per_seq, self.np_rng, + masking_style='bert') + + # Some checks. 
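+            # truncate_segments capped tokens_a + tokens_b at max_seq_length - 3 and
+            # create_tokens_and_tokentypes added [CLS]/[SEP]/[SEP], so padding_length >= 0 here.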
+ num_tokens = len(tokens) + padding_length = self.max_seq_length - num_tokens + assert padding_length >= 0 + assert len(tokentypes) == num_tokens + assert len(masked_positions) == len(masked_labels) + + # Tokens and token types. + filler = [self.tokenizer.pad_token_id] * padding_length + tokens_np = np.array(tokens + filler, dtype=np.int64) + tokentypes_np = np.array(tokentypes + filler, dtype=np.int64) + + # Padding mask. + padding_mask_np = np.array([1] * num_tokens + [0] * padding_length, + dtype=np.int64) + + # Lables and loss mask. + labels = [-100] * self.max_seq_length + for i in range(len(masked_positions)): + assert masked_positions[i] < num_tokens + labels[masked_positions[i]] = masked_labels[i] + labels_np = np.array(labels, dtype=np.int64) + model_inputs.append( + { + 'input_ids': tokens_np, + 'attention_mask': padding_mask_np, + 'token_type_ids': tokentypes_np, + 'labels': labels_np, + 'next_sentence_label': int(is_next_random) + } + ) + return default_collate(model_inputs) + + +class ErLangShenBert(LightningModule): + @staticmethod + def add_module_specific_args(parent_parser): + parser = parent_parser.add_argument_group('Erlangshen Bert') + parser.add_argument('--masked_lm_prob', type=float, default=0.15) + parser.add_argument('--max_seq_length', type=int, default=512) + parser.add_argument('--sample_content_key', type=str, default='text') + return parent_parser + + def __init__(self, args, tokenizer, **kwargs) -> None: + super().__init__() + self.save_hyperparameters(args) + config = MegatronBertConfig.from_pretrained(args.model_path) + self.config = config + self.tokenizer = tokenizer + self.model = MegatronBertForPreTraining(config) + + def setup(self, stage) -> None: + if stage == 'fit': + self.total_steps = get_total_steps(self.trainer, self.hparams) + print('Total steps: {}' .format(self.total_steps)) + + def configure_optimizers(self): + return configure_optimizers(self) + + def forward(self, **batch): + return self.model(**batch) + + def detokenize(self, token_ids): + toks = self.tokenizer.convert_ids_to_tokens(token_ids) + return self.tokenizer.convert_tokens_to_string(toks) + + def comput_metrix(self, logits, labels): + y_pred = torch.argmax(logits, dim=-1) + y_pred = y_pred.view(size=(-1,)) + y_true = labels.view(size=(-1,)).float() + corr = torch.eq(y_pred, y_true) + acc = torch.sum(corr.float())/labels.shape[0] + return acc + + def training_step(self, batch, batch_idx): + if self.trainer.global_rank == 0: + global SHOW_DATA + if not SHOW_DATA: + print(self.config) + print(self.model) + SHOW_DATA = True + print('source: {}'.format(batch['input_ids'][0])) + print('target: {}'.format(batch['labels'][0])) + print('source: {}'.format(self.detokenize(batch['input_ids'][0]))) + label_idx = batch['labels'][0] != -100 + print('target: {}'.format(self.detokenize( + batch['labels'][0][label_idx]))) + output = self(**batch) + self.log('train_loss', output.loss, sync_dist=True) + label_idx = batch['labels'] != -100 + acc = self.comput_metrix( + output.prediction_logits[label_idx].view(-1, output.prediction_logits.size(-1)), batch['labels'][label_idx]) + self.log('train_acc', acc, sync_dist=True) + return output.loss + + def validation_step(self, batch, batch_idx): + output = self(**batch) + self.log('val_loss', output.loss, sync_dist=True) + return output.loss + + def on_load_checkpoint(self, checkpoint) -> None: + # 兼容低版本lightning,低版本lightning从ckpt起来时steps数会被重置为0 + global_step_offset = checkpoint["global_step"] + if 'global_samples' in checkpoint: + self.consumed_samples = 
checkpoint['global_samples'] + self.trainer.fit_loop.epoch_loop._batches_that_stepped = global_step_offset + + +if __name__ == '__main__': + args_parser = argparse.ArgumentParser() + args_parser = add_module_args(args_parser) + args_parser = UniversalDataModule.add_data_specific_args(args_parser) + args_parser = Trainer.add_argparse_args(args_parser) + args_parser = ErLangShenBert.add_module_specific_args(args_parser) + args_parser = UniversalCheckpoint.add_argparse_args(args_parser) + args = args_parser.parse_args() + + tokenizer = AutoTokenizer.from_pretrained(args.model_path) + collate_fn = ErLangShenCollator( + tokenizer=tokenizer, + max_seq_length=args.max_seq_length, + masked_lm_prob=args.masked_lm_prob, + content_key=args.sample_content_key, + ) + collate_fn.setup() + data_module = UniversalDataModule(tokenizer=tokenizer, args=args, collate_fn=collate_fn) + print('data load complete') + + model = ErLangShenBert(args, tokenizer=tokenizer) + print('model load complete') + + lr_monitor = LearningRateMonitor(logging_interval='step') + checkpoint_callback = UniversalCheckpoint(args) + + # 做兼容,如果目录不存在的话把这个参数去掉,不然会报错 + if args.load_ckpt_path is not None and \ + not os.path.exists(args.load_ckpt_path): + print('--------warning no checkpoint found--------, remove args') + args.load_ckpt_path = None + + trainer = Trainer.from_argparse_args(args, + callbacks=[ + lr_monitor, + checkpoint_callback]) + + trainer.fit(model, data_module, ckpt_path=args.load_ckpt_path) diff --git a/fengshen/examples/pretrain_erlangshen_bert/pretrain_erlangshen_base.sh b/fengshen/examples/pretrain_erlangshen_bert/pretrain_erlangshen_base.sh new file mode 100644 index 0000000000000000000000000000000000000000..d3368c20dc1d5d287bef0619e341b35cc6228362 --- /dev/null +++ b/fengshen/examples/pretrain_erlangshen_bert/pretrain_erlangshen_base.sh @@ -0,0 +1,87 @@ +#!/bin/bash +#SBATCH --job-name=pretrain_bart # create a short name for your job +#SBATCH --nodes=1 # node count +#SBATCH --ntasks-per-node=8 # number of tasks to run per node +#SBATCH --cpus-per-task=30 # cpu-cores per task (>1 if multi-threaded tasks) +#SBATCH --gres=gpu:8 # number of gpus per node +#SBATCH -o %x-%j.log # output and error log file names (%x for job id) +#SBATCH -x dgx050 + +# pwd=Fengshenbang-LM/fengshen/examples/pretrain_erlangshen +ROOT_DIR=../../workspace +export TORCH_EXTENSIONS_DIR=${ROOT_DIR}/torch_extendsions + +MODEL_NAME=erlangshen-bert-base +MODEL_ROOT_DIR=$ROOT_DIR/${MODEL_NAME} +if [ ! 
-d ${MODEL_ROOT_DIR} ];then + mkdir ${MODEL_ROOT_DIR} +fi + +NNODES=1 +GPUS_PER_NODE=1 + +MICRO_BATCH_SIZE=32 + +# 如果你不用Deepspeed的话 下面的一段话都可以删掉 Begin +CONFIG_JSON="$MODEL_ROOT_DIR/${MODEL_NAME}.ds_config.json" +ZERO_STAGE=1 +# Deepspeed figures out GAS dynamically from dynamic GBS via set_train_batch_size() +cat < $CONFIG_JSON +{ + "zero_optimization": { + "stage": ${ZERO_STAGE} + }, + "fp16": { + "enabled": true + }, + "gradient_clipping": 2, + "train_micro_batch_size_per_gpu": $MICRO_BATCH_SIZE +} +EOT +export PL_DEEPSPEED_CONFIG_PATH=$CONFIG_JSON +### End + +DATA_ARGS="\ + --dataloader_workers 2 \ + --train_batchsize $MICRO_BATCH_SIZE \ + --val_batchsize $MICRO_BATCH_SIZE \ + --test_batchsize $MICRO_BATCH_SIZE \ + --datasets_name IDEA-CCNL/PretrainCorpusDemo \ + " +# 如果你有一批数据,可以参照IDEA-CCNL/PretrainCorpusDemo的格式处理,通过参数传入 +# --train_file train.json +# --val_file val.json +# --test_file test.json + +MODEL_ARGS="\ + --model_path $MODEL_ROOT_DIR/pretrain \ + --learning_rate 1e-4 \ + --weight_decay 1e-1 \ + --warmup_ratio 0.01 \ + " + +MODEL_CHECKPOINT_ARGS="\ + --save_last \ + --save_ckpt_path ${MODEL_ROOT_DIR}/ckpt \ + --load_ckpt_path ${MODEL_ROOT_DIR}/ckpt/last.ckpt \ + " + +TRAINER_ARGS="\ + --max_epoch 1 \ + --gpus $GPUS_PER_NODE \ + --num_nodes $NNODES \ + --strategy deepspeed_stage_${ZERO_STAGE} \ + --log_every_n_steps 1 \ + --precision 16 \ + --default_root_dir ${MODEL_ROOT_DIR} \ + --replace_sampler_ddp False \ + " + +export options=" \ + $DATA_ARGS \ + $MODEL_ARGS \ + $MODEL_CHECKPOINT_ARGS \ + $TRAINER_ARGS \ + " + +python3 pretrain_erlangshen.py $options diff --git a/fengshen/examples/pretrain_erlangshen_deberta_v2/pretrain_deberta.py b/fengshen/examples/pretrain_erlangshen_deberta_v2/pretrain_deberta.py new file mode 100644 index 0000000000000000000000000000000000000000..e6bd2f81781c5bfcdd55aa1514104f8dec5d8f50 --- /dev/null +++ b/fengshen/examples/pretrain_erlangshen_deberta_v2/pretrain_deberta.py @@ -0,0 +1,227 @@ +from dataclasses import dataclass +from transformers import ( + DebertaV2Config, + DebertaV2ForMaskedLM, + AutoTokenizer, +) +from pytorch_lightning import ( + LightningModule, + Trainer, +) +from pytorch_lightning.callbacks import ( + LearningRateMonitor, +) +import argparse +import torch +import os +import numpy as np +from fengshen.data.universal_datamodule import UniversalDataModule +from fengshen.data.data_utils.truncate_utils import truncate_segments +from fengshen.data.data_utils.token_type_utils import create_tokens_and_tokentypes +from fengshen.data.data_utils.mask_utils import create_masked_lm_predictions +from fengshen.models.model_utils import ( + add_module_args, + configure_optimizers, + get_total_steps, +) +from fengshen.utils.universal_checkpoint import UniversalCheckpoint +from torch.utils.data._utils.collate import default_collate + +SHOW_DATA = False + + +@dataclass +class DeBERTaV2Collator: + ''' + 由input处理成samples,也就是最终模型的输入 + 其中主要处理逻辑在__call__里 + 包含Mask任务,使用Whole Word Mask + ''' + tokenizer: None # 分词 + max_seq_length: 512 + masked_lm_prob: 0.15 + content_key: str = 'text' + # 一些预处理操作 + + def setup(self): + self.np_rng = np.random.RandomState(seed=42) + inv_vocab = {v: k for k, v in self.tokenizer.vocab.items()} + self.vocab_id_list = list(inv_vocab.keys()) + self.vocab_id_to_token_dict = inv_vocab + import jieba_fast + self.zh_tokenizer = jieba_fast.lcut + + def __call__(self, samples): + ''' + samples: 一个sample长这样{"text": "hello world"} + ''' + model_inputs = [] + for s in samples: + tokenized_sentences = self.tokenizer.convert_tokens_to_ids( 
+ self.tokenizer.tokenize(s[self.content_key])) + if len(tokenized_sentences) == 0: + print('find empty sentence') + continue + tokens_a = tokenized_sentences + # max_seq_length - 3因为还需要拼上[CLS] [SEP] [SEP] + if len(tokens_a) == 0: + continue + _ = truncate_segments(tokens_a, [], len(tokens_a), + 0, self.max_seq_length-3, self.np_rng) + # Build tokens and toketypes. + tokens, tokentypes = create_tokens_and_tokentypes(tokens_a, [], + self.tokenizer.cls_token_id, self.tokenizer.sep_token_id) + # Masking. + max_predictions_per_seq = self.masked_lm_prob * len(tokens) + (tokens, masked_positions, masked_labels, _, _) = create_masked_lm_predictions( + tokens, self.vocab_id_list, self.vocab_id_to_token_dict, self.masked_lm_prob, + self.tokenizer.cls_token_id, self.tokenizer.sep_token_id, self.tokenizer.mask_token_id, + max_predictions_per_seq, self.np_rng, + masking_style='bert', + zh_tokenizer=self.zh_tokenizer) + + # Some checks. + num_tokens = len(tokens) + padding_length = self.max_seq_length - num_tokens + assert padding_length >= 0 + assert len(tokentypes) == num_tokens + assert len(masked_positions) == len(masked_labels) + + # Tokens and token types. + filler = [self.tokenizer.pad_token_id] * padding_length + tokens_np = np.array(tokens + filler, dtype=np.int64) + tokentypes_np = np.array(tokentypes + filler, dtype=np.int64) + + # Padding mask. + padding_mask_np = np.array([1] * num_tokens + [0] * padding_length, + dtype=np.int64) + + # Lables and loss mask. + labels = [-100] * self.max_seq_length + for i in range(len(masked_positions)): + assert masked_positions[i] < num_tokens + labels[masked_positions[i]] = masked_labels[i] + labels_np = np.array(labels, dtype=np.int64) + model_inputs.append( + { + 'input_ids': tokens_np, + 'attention_mask': padding_mask_np, + 'token_type_ids': tokentypes_np, + 'labels': labels_np, + } + ) + return default_collate(model_inputs) + + +class ErlangshenDeBERTaV2(LightningModule): + @staticmethod + def add_module_specific_args(parent_parser): + parser = parent_parser.add_argument_group('Erlangshen Bert') + parser.add_argument('--masked_lm_prob', type=float, default=0.15) + parser.add_argument('--max_seq_length', type=int, default=512) + parser.add_argument('--sample_content_key', type=str, default='text') + return parent_parser + + def __init__(self, args, tokenizer, **kwargs) -> None: + super().__init__() + self.save_hyperparameters(args) + config = DebertaV2Config.from_pretrained(args.model_path) + self.config = config + self.tokenizer = tokenizer + self.model = DebertaV2ForMaskedLM(config) + + def setup(self, stage) -> None: + if stage == 'fit': + self.total_steps = get_total_steps(self.trainer, self.hparams) + print('Total steps: {}' .format(self.total_steps)) + + def configure_optimizers(self): + return configure_optimizers(self) + + def forward(self, **batch): + return self.model(**batch) + + def detokenize(self, token_ids): + toks = self.tokenizer.convert_ids_to_tokens(token_ids) + return self.tokenizer.convert_tokens_to_string(toks) + + def comput_metrix(self, logits, labels): + y_pred = torch.argmax(logits, dim=-1) + y_pred = y_pred.view(size=(-1,)) + y_true = labels.view(size=(-1,)).float() + corr = torch.eq(y_pred, y_true) + acc = torch.sum(corr.float())/labels.shape[0] + return acc + + def training_step(self, batch, batch_idx): + if self.trainer.global_rank == 0: + global SHOW_DATA + if not SHOW_DATA: + print(self.config) + print(self.model) + SHOW_DATA = True + print('source: {}'.format(batch['input_ids'][0])) + print('target: 
{}'.format(batch['labels'][0])) + print('source: {}'.format(self.detokenize(batch['input_ids'][0]))) + label_idx = batch['labels'][0] != -100 + print('target: {}'.format(self.detokenize( + batch['labels'][0][label_idx]))) + output = self(**batch) + self.log('train_loss', output.loss, sync_dist=True) + label_idx = batch['labels'] != -100 + acc = self.comput_metrix( + output.logits[label_idx].view(-1, output.logits.size(-1)), batch['labels'][label_idx]) + self.log('train_acc', acc, sync_dist=True) + return output.loss + + def validation_step(self, batch, batch_idx): + output = self(**batch) + self.log('val_loss', output.loss, sync_dist=True) + return output.loss + + def on_load_checkpoint(self, checkpoint) -> None: + # 兼容低版本lightning,低版本lightning从ckpt起来时steps数会被重置为0 + global_step_offset = checkpoint["global_step"] + if 'global_samples' in checkpoint: + self.consumed_samples = checkpoint['global_samples'] + self.trainer.fit_loop.epoch_loop._batches_that_stepped = global_step_offset + + +if __name__ == '__main__': + args_parser = argparse.ArgumentParser() + args_parser = add_module_args(args_parser) + args_parser = UniversalDataModule.add_data_specific_args(args_parser) + args_parser = Trainer.add_argparse_args(args_parser) + args_parser = ErlangshenDeBERTaV2.add_module_specific_args(args_parser) + args_parser = UniversalCheckpoint.add_argparse_args(args_parser) + args = args_parser.parse_args() + + tokenizer = AutoTokenizer.from_pretrained(args.model_path) + collate_fn = DeBERTaV2Collator( + tokenizer=tokenizer, + max_seq_length=args.max_seq_length, + masked_lm_prob=args.masked_lm_prob, + content_key=args.sample_content_key, + ) + collate_fn.setup() + data_module = UniversalDataModule(tokenizer=tokenizer, args=args, collate_fn=collate_fn) + print('data load complete') + + model = ErlangshenDeBERTaV2(args, tokenizer=tokenizer) + print('model load complete') + + lr_monitor = LearningRateMonitor(logging_interval='step') + checkpoint_callback = UniversalCheckpoint(args) + + # 做兼容,如果目录不存在的话把这个参数去掉,不然会报错 + if args.load_ckpt_path is not None and \ + not os.path.exists(args.load_ckpt_path): + print('--------warning no checkpoint found--------, remove args') + args.load_ckpt_path = None + + trainer = Trainer.from_argparse_args(args, + callbacks=[ + lr_monitor, + checkpoint_callback]) + + trainer.fit(model, data_module, ckpt_path=args.load_ckpt_path) diff --git a/fengshen/examples/pretrain_erlangshen_deberta_v2/pretrain_deberta_base.sh b/fengshen/examples/pretrain_erlangshen_deberta_v2/pretrain_deberta_base.sh new file mode 100644 index 0000000000000000000000000000000000000000..bf6ad5cb30f14173854aa66bf91d731151ec47d7 --- /dev/null +++ b/fengshen/examples/pretrain_erlangshen_deberta_v2/pretrain_deberta_base.sh @@ -0,0 +1,88 @@ +#!/bin/bash +#SBATCH --job-name=pretrain_bart # create a short name for your job +#SBATCH --nodes=1 # node count +#SBATCH --ntasks-per-node=8 # number of tasks to run per node +#SBATCH --cpus-per-task=30 # cpu-cores per task (>1 if multi-threaded tasks) +#SBATCH --gres=gpu:8 # number of gpus per node +#SBATCH -o %x-%j.log # output and error log file names (%x for job id) +#SBATCH -x dgx050 + +# pwd=Fengshenbang-LM/fengshen/examples/pretrain_erlangshen +ROOT_DIR=../../workspace +export TORCH_EXTENSIONS_DIR=${ROOT_DIR}/torch_extendsions + +MODEL_NAME=erlangshen-deberta-base +MODEL_ROOT_DIR=$ROOT_DIR/${MODEL_NAME} +if [ ! 
-d ${MODEL_ROOT_DIR} ];then + mkdir ${MODEL_ROOT_DIR} +fi + +NNODES=1 +GPUS_PER_NODE=1 + +MICRO_BATCH_SIZE=32 + +# 如果你不用Deepspeed的话 下面的一段话都可以删掉 Begin +CONFIG_JSON="$MODEL_ROOT_DIR/${MODEL_NAME}.ds_config.json" +ZERO_STAGE=1 +# Deepspeed figures out GAS dynamically from dynamic GBS via set_train_batch_size() +cat < $CONFIG_JSON +{ + "zero_optimization": { + "stage": ${ZERO_STAGE} + }, + "fp16": { + "enabled": true + }, + "gradient_clipping": 1, + "train_micro_batch_size_per_gpu": $MICRO_BATCH_SIZE +} +EOT +export PL_DEEPSPEED_CONFIG_PATH=$CONFIG_JSON +### End + +DATA_ARGS="\ + --dataloader_workers 2 \ + --train_batchsize $MICRO_BATCH_SIZE \ + --val_batchsize $MICRO_BATCH_SIZE \ + --test_batchsize $MICRO_BATCH_SIZE \ + --datasets_name IDEA-CCNL/PretrainCorpusDemo \ + " +# 如果你有一批数据,可以参照IDEA-CCNL/PretrainCorpusDemo的格式处理,通过参数传入 +# --train_file train.json +# --val_file val.json +# --test_file test.json + +MODEL_ARGS="\ + --model_path $MODEL_ROOT_DIR/pretrain \ + --learning_rate 1e-4 \ + --weight_decay 1e-1 \ + --warmup_ratio 0.01 \ + " + +MODEL_CHECKPOINT_ARGS="\ + --save_last \ + --save_ckpt_path ${MODEL_ROOT_DIR}/ckpt \ + --load_ckpt_path ${MODEL_ROOT_DIR}/ckpt/last.ckpt \ + " + +TRAINER_ARGS="\ + --max_epoch 10 \ + --gpus $GPUS_PER_NODE \ + --num_nodes $NNODES \ + --strategy deepspeed_stage_${ZERO_STAGE} \ + --log_every_n_steps 1 \ + --precision 16 \ + --default_root_dir ${MODEL_ROOT_DIR} \ + --replace_sampler_ddp False \ + " + +export options=" \ + $DATA_ARGS \ + $MODEL_ARGS \ + $MODEL_CHECKPOINT_ARGS \ + $TRAINER_ARGS \ + " + +python3 pretrain_deberta.py $options +#srun -N $NNODES --gres=gpu:$GPUS_PER_NODE --ntasks-per-node=$GPUS_PER_NODE --cpus-per-task=20 python3 pretrain_deberta.py $options diff --git a/fengshen/examples/pretrain_randeng_bart/pretrain_bart.py b/fengshen/examples/pretrain_randeng_bart/pretrain_bart.py new file mode 100644 index 0000000000000000000000000000000000000000..f8c779de17c7b990b05e0e189cc1c486b8678115 --- /dev/null +++ b/fengshen/examples/pretrain_randeng_bart/pretrain_bart.py @@ -0,0 +1,281 @@ +from transformers import AutoTokenizer, BartForConditionalGeneration, BartConfig +from pytorch_lightning import ( + LightningModule, + Trainer, +) +from pytorch_lightning.callbacks import LearningRateMonitor +from dataclasses import dataclass +import os +import argparse +import torch +import math +import time +from torch.utils.data._utils.collate import default_collate +from fengshen.data.data_utils.mask_utils import create_masked_lm_predictions +from fengshen.data.universal_datamodule import UniversalDataModule +from fengshen.utils import UniversalCheckpoint +from fengshen.models.model_utils import ( + get_total_steps, + configure_optimizers, + add_module_args, +) +import numpy as np +SHOW_DATA = False + + +@ dataclass +class BartCollator: + ''' + 由input处理成samples,也就是最终模型的输入 + 其中主要处理逻辑在__call__里 + 包含text infilling和sentence shuffle任务 + ''' + tokenizer: None # 分词 + max_seq_length: 512 + masked_lm_prob: 0.15 + permute_sentence_ratio: 1.0 + content_key: str = 'text' + + def setup(self): + from fengshen.data.data_utils.sentence_split import ChineseSentenceSplitter + self.sentence_split = ChineseSentenceSplitter() + self.np_rng = np.random.RandomState(seed=((int(time.time()) % 2**32))) + inv_vocab = {v: k for k, v in self.tokenizer.vocab.items()} + self.vocab_id_list = list(inv_vocab.keys()) + self.vocab_id_to_token_dict = inv_vocab + import jieba_fast + self.zh_tokenizer = jieba_fast.lcut + seg_tokens = ['。', ';', ';', '!', '!', '?', '?'] + seg_token_ids = [] + for t in 
seg_tokens: + if t in self.tokenizer.vocab: + seg_token_ids.append(self.tokenizer.vocab[t]) + else: + print('seg_token "{}" not in vocab'.format(t)) + self.seg_token_ids = set(seg_token_ids) + + def permute_sentences(self, source, full_stops, p=1.0): + # Tokens that are full stops, where the previous token is not + sentence_ends = (full_stops[1:] * ~full_stops[:-1]).nonzero(as_tuple=False) + 2 + result = source.clone() + + num_sentences = sentence_ends.size(0) + num_to_permute = math.ceil((num_sentences * 2 * p) / 2.0) + substitutions = torch.randperm(num_sentences)[:num_to_permute] + ordering = torch.arange(0, num_sentences) + ordering[substitutions] = substitutions[torch.randperm(num_to_permute)] + + # Ignore at start + index = 1 + for i in ordering: + sentence = source[(sentence_ends[i - 1] if i > 0 else 1): sentence_ends[i]] + result[index: index + sentence.size(0)] = sentence + index += sentence.size(0) + return result + + def __call__(self, samples): + ''' + samples: 一个sample长这样{"text": "hello world"} + ''' + model_inputs = [] + for s in samples: + sentences = self.sentence_split.tokenize(s[self.content_key]) + tokenized_sentences = [self.tokenizer.convert_tokens_to_ids( + self.tokenizer.tokenize(sent)) for sent in sentences] + if len(tokenized_sentences) == 0: + print('find empty sentence') + continue + + tokens = [self.tokenizer.cls_token_id] + for sent in tokenized_sentences: + for t in sent: + tokens.append(t) + if tokens[-1] != self.tokenizer.sep_token_id: + tokens.append(self.tokenizer.sep_token_id) + + if len(tokens) > self.max_seq_length: + # 找到最后的一句话,如果有的话,尽量保证最后一句话的完整 + last_pos = self.max_seq_length - 1 + for i in range(self.max_seq_length - 1, 0, -1): + if tokens[i-1] in self.seg_token_ids: + last_pos = i + break + tokens = tokens[:last_pos] + + tokens.append(self.tokenizer.sep_token_id) + tokens = torch.LongTensor(tokens) + + full_stops = torch.any(torch.stack([torch.eq(tokens, aelem).logical_or_( + torch.eq(tokens, aelem)) for aelem in self.seg_token_ids], dim=0), dim=0) + + assert (self.max_seq_length - + tokens.shape[0]) >= 0, (tokens.size(), tokens[-1], self.max_seq_length) + + source, target = tokens, tokens.clone() + + if self.permute_sentence_ratio > 0.0: + source = self.permute_sentences(source, full_stops, self.permute_sentence_ratio) + + if self.masked_lm_prob > 0.0: + mask_prob = self.masked_lm_prob * 2 + max_predictions_per_seq = mask_prob * len(source) + (source, _, _, _, _) = create_masked_lm_predictions( + source.numpy(), self.vocab_id_list, self.vocab_id_to_token_dict, mask_prob, + self.tokenizer.cls_token_id, self.tokenizer.sep_token_id, self.tokenizer.mask_token_id, + max_predictions_per_seq, self.np_rng, + masking_style='bert', zh_tokenizer=self.zh_tokenizer) + # 合并[MASK] 因为这里用的是Bert的mask函数,Bert是按字mask的, + # 这里把连续的mask合并成一个MASK从而达到span mask的效果 + span_mask_souce = [] + for t in source: + # 如果是连续的多个mask,则跳过 + if len(span_mask_souce) > 0 \ + and t is self.tokenizer.mask_token_id \ + and span_mask_souce[-1] is self.tokenizer.mask_token_id: + continue + span_mask_souce.append(t) + + source = torch.LongTensor(span_mask_souce) + + assert (source >= 0).all() + # assert (source[1:-1] >= 1).all(), source + assert (source <= self.tokenizer.vocab_size).all() + assert source[0] == self.tokenizer.cls_token_id + assert source[-1] == self.tokenizer.sep_token_id + + prev_output_tokens = torch.zeros_like(target) + # match the preprocessing in fairseq + prev_output_tokens[0] = self.tokenizer.sep_token_id + prev_output_tokens[1:] = target[:-1] + + source_ = 
torch.full((self.max_seq_length,), + self.tokenizer.pad_token_id, dtype=torch.long) + source_[:source.shape[0]] = source + target_ = torch.full((self.max_seq_length,), -100, dtype=torch.long) + target_[:target.shape[0]] = target + prev_output_tokens_ = torch.full( + (self.max_seq_length,), self.tokenizer.pad_token_id, dtype=torch.long) + prev_output_tokens_[:prev_output_tokens.shape[0]] = prev_output_tokens + attention_mask = torch.full((self.max_seq_length,), 0, dtype=torch.long) + attention_mask[:source.shape[0]] = 1 + model_inputs.append({ + "input_ids": source_, + "labels": target_, + "decoder_input_ids": prev_output_tokens_, + "attention_mask": attention_mask, + }) + return default_collate(model_inputs) + + +class RandengBart(LightningModule): + @staticmethod + def add_module_specific_args(parent_parser): + parser = parent_parser.add_argument_group('Randeng BART') + parser.add_argument('--masked_lm_prob', type=float, default=0.15) + parser.add_argument('--max_seq_length', type=int, default=512) + parser.add_argument('--sample_content_key', type=str, default='text') + parser.add_argument('--permute_sentence_ratio', type=str, default=1.0) + return parent_parser + + def __init__(self, args, tokenizer, **kwargs) -> None: + super().__init__() + self.save_hyperparameters(args) + config = BartConfig.from_pretrained(args.model_path) + self.model = BartForConditionalGeneration(config) + self.tokenizer = tokenizer + + def setup(self, stage) -> None: + if stage == 'fit': + self.total_steps = get_total_steps(self.trainer, self.hparams) + + def configure_optimizers(self): + return configure_optimizers(self) + + def detokenize(self, token_ids): + toks = self.tokenizer.convert_ids_to_tokens(token_ids) + return self.tokenizer.convert_tokens_to_string(toks) + + def training_step(self, batch, batch_idx): + if self.trainer.global_rank == 0: + global SHOW_DATA + if not SHOW_DATA: + SHOW_DATA = True + print('source: {}'.format(batch['input_ids'][0])) + print('target: {}'.format(batch['labels'][0])) + print('decoder source: {}'.format(batch['decoder_input_ids'][0])) + + print('source: {}'.format(self.detokenize(batch['input_ids'][0]))) + print('decoder source: {}'.format(self.detokenize(batch['decoder_input_ids'][0]))) + label_idx = batch['labels'][0] != -100 + print('target: {}'.format(self.detokenize( + batch['labels'][0][label_idx]))) + output = self.model(**batch) + acc = self.comput_metrix(output.logits, batch['labels']) + self.log('train_loss', output.loss, sync_dist=True) + self.log('train_acc', acc, sync_dist=True) + return output.loss + + def comput_metrix(self, logits, labels): + label_idx = labels != -100 + labels = labels[label_idx] + logits = logits[label_idx].view(-1, logits.size(-1)) + y_pred = torch.argmax(logits, dim=-1) + y_pred = y_pred.view(size=(-1,)) + y_true = labels.view(size=(-1,)).float() + corr = torch.eq(y_pred, y_true) + acc = torch.sum(corr.float())/labels.shape[0] + return acc + + def validation_step(self, batch, batch_idx): + output = self.model(**batch) + acc = self.comput_metrix(output.logits, batch['labels']) + self.log('val_loss', output.loss, sync_dist=True) + self.log('val_acc', acc, sync_dist=True) + + def on_load_checkpoint(self, checkpoint) -> None: + # 兼容低版本lightning,低版本lightning从ckpt起来时steps数会被重置为0 + global_step_offset = checkpoint["global_step"] + if 'global_samples' in checkpoint: + self.consumed_samples = checkpoint['global_samples'] + self.trainer.fit_loop.epoch_loop._batches_that_stepped = global_step_offset + + +if __name__ == '__main__': + args_parser = 
argparse.ArgumentParser() + args_parser = add_module_args(args_parser) + args_parser = UniversalDataModule.add_data_specific_args(args_parser) + args_parser = Trainer.add_argparse_args(args_parser) + args_parser = RandengBart.add_module_specific_args(args_parser) + args_parser = UniversalCheckpoint.add_argparse_args(args_parser) + args = args_parser.parse_args() + + tokenizer = AutoTokenizer.from_pretrained(args.model_path) + + collator = BartCollator( + tokenizer=tokenizer, + max_seq_length=args.max_seq_length, + masked_lm_prob=args.masked_lm_prob, + content_key=args.sample_content_key, + permute_sentence_ratio=args.permute_sentence_ratio, + ) + # 准备一些额外参数 + collator.setup() + data_module = UniversalDataModule(tokenizer=tokenizer, args=args, collate_fn=collator) + + module = RandengBart(args, tokenizer=tokenizer) + + lr_monitor = LearningRateMonitor(logging_interval='step') + checkpoint_callback = UniversalCheckpoint(args) + + # 做兼容,如果目录不存在的话把这个参数去掉,不然会报错 + if args.load_ckpt_path is not None and \ + not os.path.exists(args.load_ckpt_path): + print('--------warning no checkpoint found--------, remove args') + args.load_ckpt_path = None + + trainer = Trainer.from_argparse_args(args, + callbacks=[ + lr_monitor, + checkpoint_callback]) + + trainer.fit(module, data_module, ckpt_path=args.load_ckpt_path) diff --git a/fengshen/examples/pretrain_randeng_bart/pretrain_bart_base.sh b/fengshen/examples/pretrain_randeng_bart/pretrain_bart_base.sh new file mode 100644 index 0000000000000000000000000000000000000000..2ac4d8d40a2135c7439c150d7b208f94ba002a0d --- /dev/null +++ b/fengshen/examples/pretrain_randeng_bart/pretrain_bart_base.sh @@ -0,0 +1,87 @@ +#!/bin/bash +#SBATCH --job-name=pretrain_bart # create a short name for your job +#SBATCH --nodes=1 # node count +#SBATCH --ntasks-per-node=8 # number of tasks to run per node +#SBATCH --cpus-per-task=30 # cpu-cores per task (>1 if multi-threaded tasks) +#SBATCH --gres=gpu:8 # number of gpus per node +#SBATCH -o %x-%j.log # output and error log file names (%x for job id) +#SBATCH -x dgx050 + +# pwd=Fengshenbang-LM/fengshen/examples/pretrain_erlangshen +ROOT_DIR=../../workspace +export TORCH_EXTENSIONS_DIR=${ROOT_DIR}/torch_extendsions + +MODEL_NAME=randeng-bart-base +MODEL_ROOT_DIR=$ROOT_DIR/${MODEL_NAME} +if [ ! 
-d ${MODEL_ROOT_DIR} ];then + mkdir ${MODEL_ROOT_DIR} +fi + +NNODES=1 +GPUS_PER_NODE=1 + +MICRO_BATCH_SIZE=32 + +# 如果你不用Deepspeed的话 下面的一段话都可以删掉 Begin +CONFIG_JSON="$MODEL_ROOT_DIR/${MODEL_NAME}.ds_config.json" +ZERO_STAGE=1 +# Deepspeed figures out GAS dynamically from dynamic GBS via set_train_batch_size() +cat < $CONFIG_JSON +{ + "zero_optimization": { + "stage": ${ZERO_STAGE} + }, + "fp16": { + "enabled": true + }, + "gradient_clipping": 1, + "train_micro_batch_size_per_gpu": $MICRO_BATCH_SIZE +} +EOT +export PL_DEEPSPEED_CONFIG_PATH=$CONFIG_JSON +### End + +DATA_ARGS="\ + --dataloader_workers 2 \ + --train_batchsize $MICRO_BATCH_SIZE \ + --val_batchsize $MICRO_BATCH_SIZE \ + --test_batchsize $MICRO_BATCH_SIZE \ + " +# 如果你有一批数据,可以参照IDEA-CCNL/PretrainCorpusDemo的格式处理,通过参数传入 +# --train_file train.json +# --val_file val.json +# --test_file test.json + +MODEL_ARGS="\ + --model_path $MODEL_ROOT_DIR/pretrain \ + --learning_rate 1e-4 \ + --weight_decay 1e-1 \ + --warmup_ratio 0.01 \ + " + +MODEL_CHECKPOINT_ARGS="\ + --save_last \ + --save_ckpt_path ${MODEL_ROOT_DIR}/ckpt \ + --load_ckpt_path ${MODEL_ROOT_DIR}/ckpt/last.ckpt \ + " + +TRAINER_ARGS="\ + --max_epoch 10 \ + --gpus $GPUS_PER_NODE \ + --num_nodes $NNODES \ + --strategy deepspeed_stage_${ZERO_STAGE} \ + --log_every_n_steps 1 \ + --precision 16 \ + --default_root_dir ${MODEL_ROOT_DIR} \ + --replace_sampler_ddp False \ + " + +export options=" \ + $DATA_ARGS \ + $MODEL_ARGS \ + $MODEL_CHECKPOINT_ARGS \ + $TRAINER_ARGS \ + " + +python3 pretrain_bart.py $options +#srun -N $NNODES --gres=gpu:$GPUS_PER_NODE --ntasks-per-node=$GPUS_PER_NODE --cpus-per-task=20 python3 pretrain_bart.py $options diff --git a/fengshen/examples/pretrain_t5/convert_ckpt_randeng_t5_char.sh b/fengshen/examples/pretrain_t5/convert_ckpt_randeng_t5_char.sh new file mode 100644 index 0000000000000000000000000000000000000000..5c446fd8784477d1caa1519b614d759aa3cb6ec8 --- /dev/null +++ b/fengshen/examples/pretrain_t5/convert_ckpt_randeng_t5_char.sh @@ -0,0 +1,30 @@ +#!/bin/bash +set -x -e + +echo "START TIME: $(date)" +BIN_DIR=/cognitive_comp/ganruyi/experiments/randeng_t5_char_57M/randeng_t5_char_57M +if [ ! -d ${BIN_DIR} ];then + mkdir ${BIN_DIR} + echo ${BIN_DIR} created!!!!!!!!!!!!!! +else + echo ${BIN_DIR} exist!!!!!!!!!!!!!!! +fi + +export TORCH_EXTENSIONS_DIR=/cognitive_comp/ganruyi/tmp/torch_extendsions + + +MODEL_ARGS=" + --ckpt_path /cognitive_comp/ganruyi/experiments/randeng_t5_char_57M/ckpt/last.ckpt/checkpoint/mp_rank_00_model_states.pt \ + --bin_path ${BIN_DIR}/pytorch_model.bin \ + --rm_prefix module.model. 
\ +" + +SCRIPTS_PATH=/cognitive_comp/ganruyi/Fengshenbang-LM/fengshen/examples/pretrain_t5/convert_ckpt_to_bin.py + +export CMD=" \ + $SCRIPTS_PATH \ + $MODEL_ARGS \ + " + +echo $CMD +/home/ganruyi/anaconda3/bin/python $CMD diff --git a/fengshen/examples/pretrain_t5/convert_ckpt_to_bin.py b/fengshen/examples/pretrain_t5/convert_ckpt_to_bin.py new file mode 100644 index 0000000000000000000000000000000000000000..2aeef8c860864d138b0c970baca72a568bf51a19 --- /dev/null +++ b/fengshen/examples/pretrain_t5/convert_ckpt_to_bin.py @@ -0,0 +1,37 @@ +import time +from builtins import print +import argparse + +import torch +# os.environ["CUDA_VISIBLE_DEVICES"] = '3' + + +def get_time_str(): + return time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) + + +def main(): + total_parser = argparse.ArgumentParser("Pretrain Unsupervise.") + total_parser.add_argument('--ckpt_path', default=None, type=str) + total_parser.add_argument('--bin_path', default=None, type=str) + total_parser.add_argument('--rm_prefix', default=None, type=str) + # * Args for base model + args = total_parser.parse_args() + print('Argument parse success.') + state_dict = torch.load(args.ckpt_path)['module'] + new_state_dict = {} + + if args.rm_prefix is not None: + prefix_len = len(args.rm_prefix) + for k, v in state_dict.items(): + if k[:prefix_len] == args.rm_prefix: + new_state_dict[k[prefix_len:]] = v + else: + new_state_dict[k] = v + else: + new_state_dict = state_dict + torch.save(new_state_dict, args.bin_path) + + +if __name__ == '__main__': + main() diff --git a/fengshen/examples/pretrain_t5/finetune_t5.py b/fengshen/examples/pretrain_t5/finetune_t5.py new file mode 100644 index 0000000000000000000000000000000000000000..497b1ca26817d2c1dbf8d1be4b5cea51ad846f4e --- /dev/null +++ b/fengshen/examples/pretrain_t5/finetune_t5.py @@ -0,0 +1,144 @@ +import time +from builtins import print +import sys +import os +import torch +import argparse +import pytorch_lightning as pl +from pytorch_lightning import Trainer, loggers +from transformers import MT5ForConditionalGeneration +from pytorch_lightning.callbacks import LearningRateMonitor +# os.environ["CUDA_VISIBLE_DEVICES"] = '3' + + +class MT5FinetuneModel(pl.LightningModule): + + @staticmethod + def add_model_specific_args(parent_args): + parser = parent_args.add_argument_group('BaseModel') + parser.add_argument('--keep_tokens_path', default=None, type=str) + return parent_args + + def __init__(self, args): + super().__init__() + self.save_hyperparameters(args) + self.model = MT5ForConditionalGeneration.from_pretrained( + args.pretrained_model_path + ) + + def setup(self, stage) -> None: + if stage == 'fit': + train_loader = self.trainer._data_connector._train_dataloader_source.dataloader() + + # Calculate total steps + if self.trainer.max_epochs > 0: + world_size = self.trainer.world_size + tb_size = self.hparams.train_batchsize * max(1, world_size) + ab_size = self.trainer.accumulate_grad_batches * float(self.trainer.max_epochs) + self.total_steps = (len(train_loader.dataset) * + self.trainer.max_epochs // tb_size) // ab_size + else: + self.total_steps = self.trainer.max_steps // self.trainer.accumulate_grad_batches + + print('Total steps: {}' .format(self.total_steps)) + + def configure_optimizers(self): + from fengshen.models.model_utils import configure_optimizers + return configure_optimizers(self) + + def training_step(self, batch, batch_idx): + output = self.model( + input_ids=batch['input_ids'], + attention_mask=batch['attention_mask'], + labels=batch['labels']) + acc = 
self.comput_metrix(output.logits, batch['labels']) + self.log('train_loss', output.loss, sync_dist=True) + self.log('train_acc', acc, sync_dist=True) + return output.loss + + def validation_step(self, batch, batch_idx): + # print('is out of index: ', batch['input_ids'][batch['input_ids'] >= 32598]) + output = self.model( + input_ids=batch['input_ids'], + attention_mask=batch['attention_mask'], + labels=batch['labels']) + acc = self.comput_metrix(output.logits, batch['labels']) + cond_output = self.model.generate( + input_ids=batch['input_ids'], + attention_mask=batch['attention_mask'], + force_words_ids=batch['force_words_ids'], + num_beams=2, + ) + cond_acc = self.comput_metrix(cond_output, batch['labels']) + self.log('val_loss', output.loss, sync_dist=True) + self.log('val_acc', acc, sync_dist=True) + self.log('cond_acc', cond_acc, sync_dist=True) + + def comput_metrix(self, logits, labels): + y_pred = torch.argmax(logits, dim=-1) + y_pred = y_pred.view(size=(-1,)) + y_true = labels.view(size=(-1,)).float() + corr = torch.eq(y_pred, y_true) + acc = torch.sum(corr.float())/y_true.shape[0] + return acc + + def on_save_checkpoint(self, checkpoint) -> None: + # Save the current loop info in the mid of epoch + # if you lightning <= 1.6.0 uncomment the line below + # checkpoint['loops'] = self.trainer.checkpoint_connector._get_loops_state_dict() + if self.trainer.global_rank == 0 and self.trainer.global_step % self.hparams.every_n_train_steps == 0: + self.model.save_pretrained(os.path.join( + self.trainer.checkpoint_callback.dirpath, + 'hf_pretrained_epoch{}_step{}'.format(self.trainer.current_epoch, self.trainer.global_step))) + + def on_load_checkpoint(self, checkpoint) -> None: + global_step_offset = checkpoint["global_step"] + if 'global_samples' in checkpoint: + self.consumed_samples = checkpoint['global_samples'] + self.trainer.fit_loop.epoch_loop._batches_that_stepped = global_step_offset + + +def get_time_str(): + return time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) + + +def main(): + total_parser = argparse.ArgumentParser("Pretrain Unsupervise.") + total_parser.add_argument( + '--do_eval_only', action='store_true', default=False) + total_parser.add_argument( + '--pretrained_model_path', default=None, type=str) + total_parser.add_argument( + '--new_vocab_path', default=None, type=str) + total_parser.add_argument('--max_seq_length', default=1024, type=int) + total_parser.add_argument('--ckpt_path', default=None, type=str) + sys.path.append('../../../') + from fengshen.data.t5_dataloader.t5_datasets import TaskT5DataModel + from fengshen.utils.universal_checkpoint import UniversalCheckpoint + # * Args for data preprocessing + total_parser = TaskT5DataModel.add_data_specific_args(total_parser) + # * Args for training + total_parser = Trainer.add_argparse_args(total_parser) + total_parser = UniversalCheckpoint.add_argparse_args(total_parser) + total_parser = MT5FinetuneModel.add_model_specific_args(total_parser) + # * Args for base model + args = total_parser.parse_args() + print('Argument parse success.') + print('TaskT5DataModel load start {}'.format(get_time_str())) + data_model = TaskT5DataModel(args) + print('TaskT5DataModel load end {}'.format(get_time_str())) + if not args.do_eval_only: + model = MT5FinetuneModel(args) + checkpoint_callback = UniversalCheckpoint(args) + lr_monitor = LearningRateMonitor(logging_interval='step') + logger = loggers.TensorBoardLogger(save_dir=os.path.join( + args.default_root_dir, 'logs/')) + trainer = Trainer.from_argparse_args(args, + 
logger=logger, + callbacks=[checkpoint_callback, lr_monitor] + ) + trainer.fit(model, data_model, ckpt_path=args.ckpt_path) + + +if __name__ == '__main__': + main() diff --git a/fengshen/examples/pretrain_t5/finetune_unimc_randeng_t5_char_57M.sh b/fengshen/examples/pretrain_t5/finetune_unimc_randeng_t5_char_57M.sh new file mode 100644 index 0000000000000000000000000000000000000000..fccf833bdc954707bdc94d6bef3821239006a2c6 --- /dev/null +++ b/fengshen/examples/pretrain_t5/finetune_unimc_randeng_t5_char_57M.sh @@ -0,0 +1,129 @@ +#!/bin/bash +#SBATCH --job-name=finetune_unimc_randeng_t5_char_57M +#SBATCH --nodes=1 +#SBATCH --ntasks-per-node=8 +#SBATCH --gres=gpu:8 # number of gpus +#SBATCH --cpus-per-task=32 # cpu-cores per task (>1 if multi-threaded tasks) +#SBATCH -o /cognitive_comp/ganruyi/experiments/randeng_t5_char_57M/%x-%j.log +#SBATCH -e /cognitive_comp/ganruyi/experiments/randeng_t5_char_57M/%x-%j.err + +set -x -e + +echo "START TIME: $(date)" +MICRO_BATCH_SIZE=64 +ROOT_DIR=/cognitive_comp/ganruyi/experiments/finetune_unimc_randeng_t5_char_57M/ +if [ ! -d ${ROOT_DIR} ];then + mkdir ${ROOT_DIR} + echo ${ROOT_DIR} created!!!!!!!!!!!!!! +else + echo ${ROOT_DIR} exist!!!!!!!!!!!!!!! +fi + +ZERO_STAGE=1 + +config_json="$ROOT_DIR/ds_config.finetune_unimc_randeng_t5_char_57M.$SLURM_JOBID.json" +export MASTER_PORT=$[RANDOM%10000+30000] +export CUDA_VISIBLE_DEVICES='6' + +cat < $config_json +{ + "train_micro_batch_size_per_gpu": ${MICRO_BATCH_SIZE}, + "steps_per_print": 100, + "gradient_clipping": 1.0, + "zero_optimization": { + "stage": $ZERO_STAGE, + "contiguous_gradients": false, + "overlap_comm": true, + "reduce_scatter": true, + "reduce_bucket_size": 50000000, + "allgather_bucket_size": 500000000 + }, + "optimizer": { + "type": "Adam", + "params": { + "lr": 1e-4, + "weight_decay": 1e-2 + } + }, + "scheduler": { + "params": { + "warmup_max_lr": 1e-04, + "warmup_min_lr": 1e-05, + "total_num_steps": 240000, + "warmup_num_steps" : 10000 + }, + "type": "WarmupDecayLR" + }, + "zero_allow_untested_optimizer": false, + "fp16": { + "enabled": true, + "loss_scale": 0, + "loss_scale_window": 1000, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "activation_checkpointing": { + "partition_activations": false, + "contiguous_memory_optimization": false + }, + "wall_clock_breakdown": false +} +EOT + +export PL_DEEPSPEED_CONFIG_PATH=$config_json +export TORCH_EXTENSIONS_DIR=/cognitive_comp/ganruyi/tmp/torch_extendsions +# strategy=ddp +strategy=deepspeed_stage_1 + +TRAINER_ARGS=" + --max_epochs 1 \ + --gpus 1 \ + --num_nodes 1 \ + --strategy ${strategy} \ + --default_root_dir $ROOT_DIR \ + --dirpath $ROOT_DIR/ckpt \ + --save_top_k 3 \ + --every_n_train_steps 100000 \ + --monitor train_loss \ + --mode min \ + --save_last \ + --val_check_interval 0.1 \ + --dataset_num_workers 4 \ + --dataloader_num_workers 4 \ + --replace_sampler_ddp False \ +" +# --accumulate_grad_batches 8 \ +TRAIN_DATA_DIR=/cognitive_comp/yangping/data/unidata/multiplechoice/pretraining_alldata/alldata/train.json +VALID_DATA_DIR=/cognitive_comp/yangping/data/unidata/multiplechoice/pretraining_alldata/alldata/dev.json + +DATA_ARGS=" + --train_batchsize $MICRO_BATCH_SIZE \ + --valid_batchsize $MICRO_BATCH_SIZE \ + --train_data_path ${TRAIN_DATA_DIR} \ + --valid_data_path ${TRAIN_DATA_DIR} \ + --max_seq_length 512 \ +" + +MODEL_ARGS=" + --pretrained_model_path /cognitive_comp/ganruyi/experiments/randeng_t5_char_57M/randeng_t5_char_57M \ + --tokenizer_type bert_tokenizer \ +" + 
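+# Editor's note (not part of the original script): VALID_DATA_DIR is defined above but
+# DATA_ARGS passes ${TRAIN_DATA_DIR} to --valid_data_path as well. If validation on
+# dev.json is intended, the flag would presumably read:
+#   --valid_data_path ${VALID_DATA_DIR} \
+# Also note that CUDA_VISIBLE_DEVICES='6' exposes a single card, which is why
+# TRAINER_ARGS uses "--gpus 1".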
+SCRIPTS_PATH=/cognitive_comp/ganruyi/Fengshenbang-LM/fengshen/examples/pretrain_t5/finetune_t5.py + +export CMD=" \ + $SCRIPTS_PATH \ + $TRAINER_ARGS \ + $MODEL_ARGS \ + $DATA_ARGS \ + " + +echo $CMD +/home/ganruyi/anaconda3/bin/python $CMD +# SINGULARITY_PATH=/cognitive_comp/ganruyi/pytorch21_06_py3_docker_image_v2.sif +# srun singularity exec --nv -B /cognitive_comp/:/cognitive_comp/ $SINGULARITY_PATH bash -c '/home/ganruyi/anaconda3/bin/python $CMD' + +# source activate base +# python $CMD +# srun --nodes=1 --gres=gpu:8 --ntasks-per-node=8 --cpus-per-task=30 --jobid=171866 -e %x-%j.err -o %x-%j.log python $CMD + diff --git a/fengshen/examples/pretrain_t5/pretrain_mt5_small.sh b/fengshen/examples/pretrain_t5/pretrain_mt5_small.sh new file mode 100644 index 0000000000000000000000000000000000000000..4e9d49e3a83d9a886890740179a9ae3739a58654 --- /dev/null +++ b/fengshen/examples/pretrain_t5/pretrain_mt5_small.sh @@ -0,0 +1,124 @@ +#!/bin/bash +#SBATCH --job-name=randeng_t5_77M +#SBATCH --nodes=1 +#SBATCH --ntasks-per-node=8 +#SBATCH --gres=gpu:8 # number of gpus +#SBATCH --cpus-per-task=30 # cpu-cores per task (>1 if multi-threaded tasks) +#SBATCH -o %x-%j.log +#SBATCH -e %x-%j.err + +set -x -e + +echo "START TIME: $(date)" +MICRO_BATCH_SIZE=64 +ROOT_DIR=/cognitive_comp/ganruyi/experiments/randeng_t5_77M/ + +ZERO_STAGE=1 + +config_json="$ROOT_DIR/ds_config.t5_cn_small_pretrain.$SLURM_JOBID.json" +export MASTER_PORT=$[RANDOM%10000+30000] + +cat < $config_json +{ + "train_micro_batch_size_per_gpu": ${MICRO_BATCH_SIZE}, + "steps_per_print": 100, + "gradient_clipping": 1.0, + "zero_optimization": { + "stage": $ZERO_STAGE, + "contiguous_gradients": false, + "overlap_comm": true, + "reduce_scatter": true, + "reduce_bucket_size": 50000000, + "allgather_bucket_size": 500000000 + }, + "optimizer": { + "type": "Adam", + "params": { + "lr": 1e-4, + "weight_decay": 1e-2 + } + }, + "scheduler": { + "params": { + "warmup_max_lr": 1e-04, + "warmup_min_lr": 1e-05, + "total_num_steps": 100000, + "warmup_num_steps" : 10000 + }, + "type": "WarmupDecayLR" + }, + "zero_allow_untested_optimizer": false, + "fp16": { + "enabled": true, + "loss_scale": 0, + "loss_scale_window": 1000, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "activation_checkpointing": { + "partition_activations": false, + "contiguous_memory_optimization": false + }, + "wall_clock_breakdown": false +} +EOT + +export PL_DEEPSPEED_CONFIG_PATH=$config_json +export TORCH_EXTENSIONS_DIR=/cognitive_comp/ganruyi/tmp/torch_extendsions +# strategy=ddp +strategy=deepspeed_stage_1 + +TRAINER_ARGS=" + --max_epochs 1 \ + --gpus 8 \ + --num_nodes 1 \ + --strategy ${strategy} \ + --default_root_dir $ROOT_DIR \ + --dirpath $ROOT_DIR/ckpt \ + --save_top_k 3 \ + --every_n_train_steps 50000 \ + --monitor train_loss \ + --mode min \ + --save_last \ + --val_check_interval 0.01 \ + --preprocessing_num_workers 20 \ +" +# --accumulate_grad_batches 8 \ +DATA_DIR=wudao_180g_t5_tokenized_512 + +DATA_ARGS=" + --train_batchsize $MICRO_BATCH_SIZE \ + --valid_batchsize $MICRO_BATCH_SIZE \ + --train_data ${DATA_DIR} \ + --train_split_size 0.999 \ + --max_seq_length 512 \ +" + +MODEL_ARGS=" + --pretrained_model_path /cognitive_comp/ganruyi/hf_models/google/mt5-small \ + --new_vocab_path /cognitive_comp/ganruyi/hf_models/t5_cn_small/sentencepiece_cn.model \ + --keep_tokens_path /cognitive_comp/ganruyi/hf_models/t5_cn_small/sentencepiece_cn_keep_tokens.json \ +" +SCRIPTS_PATH=/cognitive_comp/ganruyi/Fengshenbang-LM/fengshen/examples/pretrain_t5/pretrain_t5.py + +export CMD=" \ 
+ $SCRIPTS_PATH \ + $TRAINER_ARGS \ + $MODEL_ARGS \ + $DATA_ARGS \ + " + +echo $CMD +# source activate base +# python $CMD +# srun --nodes=1 --gres=gpu:8 --ntasks-per-node=8 --cpus-per-task=30 --jobid=171866 -e %x-%j.err -o %x-%j.log python $CMD + +SINGULARITY_PATH=/cognitive_comp/ganruyi/pytorch21_06_py3_docker_image_v2.sif +srun --jobid=171866 --job-name=randeng_t5_77M --nodes=1 --gres=gpu:8 --ntasks-per-node=8 --cpus-per-task=30 -e %x-%j.err -o %x-%j.log singularity exec --nv -B /cognitive_comp/:/cognitive_comp/ $SINGULARITY_PATH bash -c '/home/ganruyi/anaconda3/bin/python $CMD' + + +# to debug - add echo (it exits and prints what it would have launched) +#run_cmd="$PY_LAUNCHER $CMD" +# salloc --nodes=1 --gres=gpu:2 --cpus-per-gpu=20 -t 24:00:00 +# clear; srun singularity exec --nv -B /cognitive_comp/:/cognitive_comp/ $SINGULARITY_PATH bash -c '/home/ganruyi/anaconda3/bin/python $CMD' +# clear; srun singularity exec --nv -B /cognitive_comp/:/cognitive_comp/ $SINGULARITY_PATH bash -c '/home/ganruyi/anaconda3/bin/python -u -m debugpy --listen 192.168.190.2:53005 --wait-for-client $CMD' \ No newline at end of file diff --git a/fengshen/examples/pretrain_t5/pretrain_mt5_small_continue.sh b/fengshen/examples/pretrain_t5/pretrain_mt5_small_continue.sh new file mode 100644 index 0000000000000000000000000000000000000000..0a539a7e6a7fb4b750b441df98dd49f166c3c49b --- /dev/null +++ b/fengshen/examples/pretrain_t5/pretrain_mt5_small_continue.sh @@ -0,0 +1,120 @@ +#!/bin/bash +#SBATCH --job-name=t5_cn_small_pretrain_v2 +#SBATCH --nodes=1 +#SBATCH --ntasks-per-node=8 +#SBATCH --gres=gpu:8 # number of gpus +#SBATCH --cpus-per-task=30 # cpu-cores per task (>1 if multi-threaded tasks) +#SBATCH -o %x-%j.log +#SBATCH -e %x-%j.err +#SBATCH -x dgx050 + +set -x -e +source activate base + +echo "START TIME: $(date)" +MICRO_BATCH_SIZE=32 +ROOT_DIR=/cognitive_comp/ganruyi/experiments/t5_cn_small_pretrain_v2/ + +ZERO_STAGE=1 + +config_json="$ROOT_DIR/ds_config.t5_cn_small_pretrain_v2.$SLURM_JOBID.json" +export MASTER_PORT=$[RANDOM%10000+30000] +# Deepspeed figures out GAS dynamically from dynamic GBS via set_train_batch_size() + +cat < $config_json +{ + "zero_optimization": { + "stage": 1 + }, + "fp16": { + "enabled": true, + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 16, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "optimizer": { + "params": { + "betas": [ + 0.9, + 0.95 + ], + "eps": 1e-08, + "lr": 1e-04, + "weight_decay": 0.01 + }, + "type": "AdamW" + }, + "scheduler": { + "type": "WarmupLR", + "params":{ + "warmup_min_lr": 0, + "warmup_max_lr": 1e-4, + "warmup_num_steps": 10000 + } + }, + "steps_per_print": 100, + "gradient_clipping": 1, + "train_micro_batch_size_per_gpu": $MICRO_BATCH_SIZE, + "zero_allow_untested_optimizer": false +} +EOT + +export PL_DEEPSPEED_CONFIG_PATH=$config_json +export TORCH_EXTENSIONS_DIR=/cognitive_comp/ganruyi/tmp/torch_extendsions +# strategy=ddp +strategy=deepspeed_stage_1 + +TRAINER_ARGS=" + --max_epochs 1 \ + --gpus 8 \ + --num_nodes 1 \ + --strategy ${strategy} \ + --default_root_dir $ROOT_DIR \ + --dirpath $ROOT_DIR/ckpt \ + --save_top_k 3 \ + --every_n_train_steps 0 \ + --monitor train_loss \ + --mode min \ + --save_last \ + --val_check_interval 0.01 \ + --preprocessing_num_workers 20 \ +" +# --accumulate_grad_batches 8 \ +DATA_DIR=wudao_180g_mt5_tokenized + +DATA_ARGS=" + --train_batchsize $MICRO_BATCH_SIZE \ + --valid_batchsize $MICRO_BATCH_SIZE \ + --train_data ${DATA_DIR} \ + --train_split_size 0.999 \ + --max_seq_length 1024 \ +" + 
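+# Editor's note (assumption, cross-checked against pretrain_t5.py later in this patch):
+# the vocabulary-pruning branch that consumes --keep_tokens_path only runs when
+# --new_vocab_path is also supplied. This "continue" script omits --new_vocab_path, so
+# the Randeng-T5-77M checkpoint is assumed to already carry the reduced
+# Chinese/English vocabulary. --train_split_size 0.999 presumably reserves the
+# remaining 0.1% of ${DATA_DIR} as the validation split sampled by --val_check_interval.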
+MODEL_ARGS=" + --pretrained_model_path /cognitive_comp/ganruyi/experiments/t5_cn_small_pretrain/Randeng-T5-77M \ + --learning_rate 1e-4 \ + --weight_decay 0.1 \ + --keep_tokens_path /cognitive_comp/ganruyi/hf_models/t5_cn_small/sentencepiece_cn_keep_tokens.json \ +" +# --resume_from_checkpoint /cognitive_comp/ganruyi/fengshen/t5_cn_small_pretrain/ckpt/last.ckpt \ + +SCRIPTS_PATH=/cognitive_comp/ganruyi/Fengshenbang-LM/fengshen/examples/pretrain_t5/pretrain_t5.py + +export CMD=" \ + $SCRIPTS_PATH \ + $TRAINER_ARGS \ + $MODEL_ARGS \ + $DATA_ARGS \ + " + +echo $CMD + +SINGULARITY_PATH=/cognitive_comp/ganruyi/pytorch21_06_py3_docker_image_v2.sif + +# to debug - add echo (it exits and prints what it would have launched) +#run_cmd="$PY_LAUNCHER $CMD" +# salloc --nodes=1 --gres=gpu:2 --cpus-per-gpu=20 -t 24:00:00 +clear; srun singularity exec --nv -B /cognitive_comp/:/cognitive_comp/ $SINGULARITY_PATH bash -c '/home/ganruyi/anaconda3/bin/python $CMD' +# clear; srun --job-name=t5_cn_small_pretrain_v2 --jobid=153124 --nodes=1 --ntasks-per-node=8 --gres=gpu:8 --cpus-per-task=30 -o %x-%j.log -e %x-%j.err singularity exec --nv -B /cognitive_comp/:/cognitive_comp/ $SINGULARITY_PATH bash -c '/home/ganruyi/anaconda3/bin/python $CMD' diff --git a/fengshen/examples/pretrain_t5/pretrain_mt5_small_predict.sh b/fengshen/examples/pretrain_t5/pretrain_mt5_small_predict.sh new file mode 100644 index 0000000000000000000000000000000000000000..be643bb12ddf613e99a5f6ac3bd23f3ab0773a33 --- /dev/null +++ b/fengshen/examples/pretrain_t5/pretrain_mt5_small_predict.sh @@ -0,0 +1,126 @@ +#!/bin/bash +#SBATCH --job-name=t5_cn_small_pretrain +#SBATCH --nodes=1 +#SBATCH --ntasks-per-node=8 +#SBATCH --gres=gpu:8 # number of gpus +#SBATCH --cpus-per-task=30 # cpu-cores per task (>1 if multi-threaded tasks) +#SBATCH -o /cognitive_comp/ganruyi/fengshen/t5_cn_small_pretrain/%x-%j.log +#SBATCH -e /cognitive_comp/ganruyi/fengshen/t5_cn_small_pretrain/%x-%j.err + +set -x -e + +echo "START TIME: $(date)" +MICRO_BATCH_SIZE=128 +ROOT_DIR=/cognitive_comp/ganruyi/fengshen/t5_cn_small_pretrain/ + +ZERO_STAGE=2 + +config_json="$ROOT_DIR/ds_config.t5_cn_small_pretrain.json" +export MASTER_PORT=$[RANDOM%10000+30000] +# Deepspeed figures out GAS dynamically from dynamic GBS via set_train_batch_size() +cat < $config_json +{ + "train_micro_batch_size_per_gpu": 128, + "steps_per_print": 100, + "gradient_clipping": 1.0, + "zero_optimization": { + "stage": $ZERO_STAGE, + "contiguous_gradients": false, + "overlap_comm": true, + "reduce_scatter": true, + "reduce_bucket_size": 50000000, + "allgather_bucket_size": 500000000 + }, + "optimizer": { + "type": "AdamW", + "params": { + "lr": 1e-4, + "betas": [ + 0.9, + 0.95 + ], + "eps": 1e-8, + "weight_decay": 1e-2 + } + }, + "scheduler": { + "type": "WarmupLR", + "params":{ + "warmup_min_lr": 0, + "warmup_max_lr": 1e-4, + "warmup_num_steps": 10000 + } + }, + "zero_allow_untested_optimizer": false, + "fp16": { + "enabled": true, + "loss_scale": 0, + "loss_scale_window": 1000, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "activation_checkpointing": { + "partition_activations": false, + "contiguous_memory_optimization": false + }, + "wall_clock_breakdown": false +} +EOT + +export PL_DEEPSPEED_CONFIG_PATH=$config_json +export TORCH_EXTENSIONS_DIR=/cognitive_comp/ganruyi/tmp/torch_extendsions +# strategy=ddp +strategy=deepspeed_stage_2 + +TRAINER_ARGS=" + --max_epochs 1 \ + --gpus 1 \ + --num_nodes 1 \ + --strategy ${strategy} \ + --default_root_dir $ROOT_DIR \ + --dirpath $ROOT_DIR/ckpt \ + --save_top_k 
10 \ + --monitor train_loss \ + --mode min \ + --save_last \ + --val_check_interval 0.01 \ + --accumulate_grad_batches 8 \ + --resume_from_checkpoint /cognitive_comp/ganruyi/fengshen/t5_cn_small_pretrain/old-ckpt/last.ckpt \ + --do_eval_only \ +" +# --accumulate_grad_batches 8 \ +DATA_DIR=wudao_180g_mt5_tokenized + +DATA_ARGS=" + --train_batchsize $MICRO_BATCH_SIZE \ + --valid_batchsize $MICRO_BATCH_SIZE \ + --train_data wudao_180g_mt5_tokenized\ + --train_split_size 0.999 \ + --max_seq_length 1024 \ +" + +MODEL_ARGS=" + --pretrained_model_path /cognitive_comp/ganruyi/hf_models/google/mt5-small \ + --new_vocab_path /cognitive_comp/ganruyi/hf_models/t5_cn_small/sentencepiece_cn.model \ + --learning_rate 1e-4 \ + --weight_decay 0.1 \ + --keep_tokens_path /cognitive_comp/ganruyi/hf_models/t5_cn_small/sentencepiece_cn_keep_tokens.json \ +" + +SCRIPTS_PATH=/cognitive_comp/ganruyi/fengshen/pretrain_t5.py + +export CMD=" \ + $SCRIPTS_PATH \ + $TRAINER_ARGS \ + $MODEL_ARGS \ + $DATA_ARGS \ + " + +echo $CMD + +# SINGULARITY_PATH=/cognitive_comp/ganruyi/pytorch21_06_py3_docker_image_v2.sif + +# to debug - add echo (it exits and prints what it would have launched) +#run_cmd="$PY_LAUNCHER $CMD" +# clear; srun singularity exec --nv -B /cognitive_comp/:/cognitive_comp/ $SINGULARITY_PATH bash -c '/home/ganruyi/anaconda3/bin/python $CMD' +/home/ganruyi/anaconda3/bin/python $CMD \ No newline at end of file diff --git a/fengshen/examples/pretrain_t5/pretrain_randeng_t5_char_10B.sh b/fengshen/examples/pretrain_t5/pretrain_randeng_t5_char_10B.sh new file mode 100644 index 0000000000000000000000000000000000000000..6b85b4886dffc191c6d4856f66c2b3fd51817f69 --- /dev/null +++ b/fengshen/examples/pretrain_t5/pretrain_randeng_t5_char_10B.sh @@ -0,0 +1,129 @@ +#!/bin/bash +#SBATCH --job-name=pretrain_randeng_t5_char_10B +#SBATCH --nodes=4 +#SBATCH --ntasks-per-node=8 +#SBATCH --gres=gpu:8 # number of gpus +#SBATCH --cpus-per-task=32 # cpu-cores per task (>1 if multi-threaded tasks) +#SBATCH -o /cognitive_comp/ganruyi/experiments/randeng_t5_char_10B/%x-%j.log +#SBATCH -e /cognitive_comp/ganruyi/experiments/randeng_t5_char_10B/%x-%j.err + +set -x -e + +echo "START TIME: $(date)" +MICRO_BATCH_SIZE=1 +ROOT_DIR=/cognitive_comp/ganruyi/experiments/randeng_t5_char_10B/ +if [ ! -d ${ROOT_DIR} ];then + mkdir ${ROOT_DIR} + echo ${ROOT_DIR} created!!!!!!!!!!!!!! +else + echo ${ROOT_DIR} exist!!!!!!!!!!!!!!! 
+fi + +ZERO_STAGE=2 + +config_json="$ROOT_DIR/ds_config.randeng_t5_char_10B.$SLURM_JOBID.json" +export MASTER_PORT=$[RANDOM%10000+30000] +export CUDA_VISIBLE_DEVICES='1,2,3,4' + +cat < $config_json +{ + "train_micro_batch_size_per_gpu": ${MICRO_BATCH_SIZE}, + "steps_per_print": 100, + "gradient_clipping": 1.0, + "zero_optimization": { + "stage": $ZERO_STAGE, + "cpu_offload": true, + "contiguous_gradients": false, + "overlap_comm": true, + "reduce_scatter": true, + "reduce_bucket_size": 50000000, + "allgather_bucket_size": 500000000 + }, + "optimizer": { + "type": "Adam", + "params": { + "lr": 1e-4, + "weight_decay": 1e-2 + } + }, + "scheduler": { + "params": { + "warmup_max_lr": 1e-04, + "warmup_min_lr": 1e-05, + "total_num_steps": 100000, + "warmup_num_steps" : 10000 + }, + "type": "WarmupDecayLR" + }, + "zero_allow_untested_optimizer": false, + "fp16": { + "enabled": true, + "loss_scale": 0, + "loss_scale_window": 1000, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "activation_checkpointing": { + "partition_activations": false, + "contiguous_memory_optimization": false + }, + "wall_clock_breakdown": false +} +EOT + +export PL_DEEPSPEED_CONFIG_PATH=$config_json +export TORCH_EXTENSIONS_DIR=/cognitive_comp/ganruyi/tmp/torch_extendsions +# strategy=ddp +strategy=deepspeed_stage_${ZERO_STAGE} + +TRAINER_ARGS=" + --max_epochs 1 \ + --gpus 4 \ + --num_nodes 1 \ + --strategy ${strategy} \ + --default_root_dir $ROOT_DIR \ + --dirpath $ROOT_DIR/ckpt \ + --save_top_k 3 \ + --every_n_train_steps 1000000 \ + --monitor train_loss \ + --mode min \ + --save_last \ + --val_check_interval 0.1 \ + --dataset_num_workers 4 \ + --dataloader_num_workers 4 \ + --replace_sampler_ddp False \ +" +# --accumulate_grad_batches 8 \ +DATA_DIR=wudao_180g_bert_tokenized_512 + +DATA_ARGS=" + --train_batchsize $MICRO_BATCH_SIZE \ + --valid_batchsize $MICRO_BATCH_SIZE \ + --train_data_path ${DATA_DIR} \ + --train_split_size 0.999 \ + --max_seq_length 512 \ +" + +MODEL_ARGS=" + --pretrained_model_path /cognitive_comp/ganruyi/experiments/randeng_t5_char_10B/randeng_t5_char_10B \ + --tokenizer_type bert_tokenizer \ +" + +SCRIPTS_PATH=/cognitive_comp/ganruyi/Fengshenbang-LM/fengshen/examples/pretrain_t5/pretrain_t5.py + +export CMD=" \ + $SCRIPTS_PATH \ + $TRAINER_ARGS \ + $MODEL_ARGS \ + $DATA_ARGS \ + " + +echo $CMD +/home/ganruyi/anaconda3/bin/python $CMD +# SINGULARITY_PATH=/cognitive_comp/ganruyi/pytorch21_06_py3_docker_image_v2.sif +# srun singularity exec --nv -B /cognitive_comp/:/cognitive_comp/ $SINGULARITY_PATH bash -c '/home/ganruyi/anaconda3/bin/python $CMD' + +# source activate base +# python $CMD +# srun --nodes=1 --gres=gpu:8 --ntasks-per-node=8 --cpus-per-task=30 --jobid=171866 -e %x-%j.err -o %x-%j.log python $CMD + diff --git a/fengshen/examples/pretrain_t5/pretrain_randeng_t5_char_57M.sh b/fengshen/examples/pretrain_t5/pretrain_randeng_t5_char_57M.sh new file mode 100644 index 0000000000000000000000000000000000000000..8e86e8b077019a57c5a6ac28ab29749f1a2787aa --- /dev/null +++ b/fengshen/examples/pretrain_t5/pretrain_randeng_t5_char_57M.sh @@ -0,0 +1,128 @@ +#!/bin/bash +#SBATCH --job-name=pretrain_randeng_t5_char_57M +#SBATCH --nodes=1 +#SBATCH --ntasks-per-node=8 +#SBATCH --gres=gpu:8 # number of gpus +#SBATCH --cpus-per-task=32 # cpu-cores per task (>1 if multi-threaded tasks) +#SBATCH -o /cognitive_comp/ganruyi/experiments/randeng_t5_char_57M/%x-%j.log +#SBATCH -e /cognitive_comp/ganruyi/experiments/randeng_t5_char_57M/%x-%j.err + +set -x -e + +echo "START TIME: $(date)" +MICRO_BATCH_SIZE=64 
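+# Editor's note: with the TRAINER_ARGS below (--gpus 8, --num_nodes 1) and no
+# --accumulate_grad_batches, the effective global batch size is
+#   64 (micro batch) x 8 GPUs x 1 node = 512 sequences per optimizer step.
+# The generated ds_config reuses this same MICRO_BATCH_SIZE for
+# train_micro_batch_size_per_gpu, so the PyTorch Lightning and DeepSpeed settings
+# stay consistent by construction.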
+ROOT_DIR=/cognitive_comp/ganruyi/experiments/randeng_t5_char_57M/ +if [ ! -d ${ROOT_DIR} ];then + mkdir ${ROOT_DIR} + echo ${ROOT_DIR} created!!!!!!!!!!!!!! +else + echo ${ROOT_DIR} exist!!!!!!!!!!!!!!! +fi + +ZERO_STAGE=1 + +config_json="$ROOT_DIR/ds_config.randeng_t5_char_57M.$SLURM_JOBID.json" +export MASTER_PORT=$[RANDOM%10000+30000] +# export CUDA_VISIBLE_DEVICES='4,5' + +cat < $config_json +{ + "train_micro_batch_size_per_gpu": ${MICRO_BATCH_SIZE}, + "steps_per_print": 100, + "gradient_clipping": 1.0, + "zero_optimization": { + "stage": $ZERO_STAGE, + "contiguous_gradients": false, + "overlap_comm": true, + "reduce_scatter": true, + "reduce_bucket_size": 50000000, + "allgather_bucket_size": 500000000 + }, + "optimizer": { + "type": "Adam", + "params": { + "lr": 1e-4, + "weight_decay": 1e-2 + } + }, + "scheduler": { + "params": { + "warmup_max_lr": 1e-04, + "warmup_min_lr": 1e-05, + "total_num_steps": 240000, + "warmup_num_steps" : 10000 + }, + "type": "WarmupDecayLR" + }, + "zero_allow_untested_optimizer": false, + "fp16": { + "enabled": true, + "loss_scale": 0, + "loss_scale_window": 1000, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "activation_checkpointing": { + "partition_activations": false, + "contiguous_memory_optimization": false + }, + "wall_clock_breakdown": false +} +EOT + +export PL_DEEPSPEED_CONFIG_PATH=$config_json +export TORCH_EXTENSIONS_DIR=/cognitive_comp/ganruyi/tmp/torch_extendsions +# strategy=ddp +strategy=deepspeed_stage_1 + +TRAINER_ARGS=" + --max_epochs 1 \ + --gpus 8 \ + --num_nodes 1 \ + --strategy ${strategy} \ + --default_root_dir $ROOT_DIR \ + --dirpath $ROOT_DIR/ckpt \ + --save_top_k 3 \ + --every_n_train_steps 100000 \ + --monitor train_loss \ + --mode min \ + --save_last \ + --val_check_interval 0.1 \ + --dataset_num_workers 4 \ + --dataloader_num_workers 4 \ + --replace_sampler_ddp False \ +" +# --accumulate_grad_batches 8 \ +DATA_DIR=wudao_180g_bert_tokenized_512 + +DATA_ARGS=" + --train_batchsize $MICRO_BATCH_SIZE \ + --valid_batchsize $MICRO_BATCH_SIZE \ + --train_data_path ${DATA_DIR} \ + --train_split_size 0.999 \ + --max_seq_length 512 \ +" + +MODEL_ARGS=" + --pretrained_model_path /cognitive_comp/ganruyi/experiments/randeng_t5_char_57M/randeng_t5_char_57M \ + --tokenizer_type bert_tokenizer \ +" + +SCRIPTS_PATH=/cognitive_comp/ganruyi/Fengshenbang-LM/fengshen/examples/pretrain_t5/pretrain_t5.py + +export CMD=" \ + $SCRIPTS_PATH \ + $TRAINER_ARGS \ + $MODEL_ARGS \ + $DATA_ARGS \ + " + +echo $CMD +/home/ganruyi/anaconda3/bin/python $CMD +# SINGULARITY_PATH=/cognitive_comp/ganruyi/pytorch21_06_py3_docker_image_v2.sif +# srun singularity exec --nv -B /cognitive_comp/:/cognitive_comp/ $SINGULARITY_PATH bash -c '/home/ganruyi/anaconda3/bin/python $CMD' + +# source activate base +# python $CMD +# srun --nodes=1 --gres=gpu:8 --ntasks-per-node=8 --cpus-per-task=30 --jobid=171866 -e %x-%j.err -o %x-%j.log python $CMD + diff --git a/fengshen/examples/pretrain_t5/pretrain_randeng_t5_char_700M.sh b/fengshen/examples/pretrain_t5/pretrain_randeng_t5_char_700M.sh new file mode 100644 index 0000000000000000000000000000000000000000..5b3b2c6c87831ebce78d4f7e0ed133b7a8468ba2 --- /dev/null +++ b/fengshen/examples/pretrain_t5/pretrain_randeng_t5_char_700M.sh @@ -0,0 +1,129 @@ +#!/bin/bash +#SBATCH --job-name=pretrain_randeng_t5_char_700M +#SBATCH --nodes=2 +#SBATCH --ntasks-per-node=8 +#SBATCH --gres=gpu:8 # number of gpus +#SBATCH --cpus-per-task=30 # cpu-cores per task (>1 if multi-threaded tasks) +#SBATCH -o 
/cognitive_comp/ganruyi/experiments/randeng_t5_char_700M/%x-%j.log +#SBATCH -e /cognitive_comp/ganruyi/experiments/randeng_t5_char_700M/%x-%j.err + +set -x -e + +echo "START TIME: $(date)" +MICRO_BATCH_SIZE=8 +ROOT_DIR=/cognitive_comp/ganruyi/experiments/randeng_t5_char_700M/ +if [ ! -d ${ROOT_DIR} ];then + mkdir ${ROOT_DIR} + echo ${ROOT_DIR} created!!!!!!!!!!!!!! +else + echo ${ROOT_DIR} exist!!!!!!!!!!!!!!! +fi + +ZERO_STAGE=1 + +config_json="$ROOT_DIR/ds_config.randeng_t5_char_700M.$SLURM_JOBID.json" +export MASTER_PORT=$[RANDOM%10000+30000] +# export CUDA_VISIBLE_DEVICES='2,5' + +cat < $config_json +{ + "train_micro_batch_size_per_gpu": ${MICRO_BATCH_SIZE}, + "steps_per_print": 100, + "gradient_clipping": 1.0, + "zero_optimization": { + "stage": $ZERO_STAGE, + "contiguous_gradients": false, + "overlap_comm": true, + "reduce_scatter": true, + "reduce_bucket_size": 50000000, + "allgather_bucket_size": 500000000 + }, + "optimizer": { + "type": "Adam", + "params": { + "lr": 1e-4, + "weight_decay": 1e-2 + } + }, + "scheduler": { + "params": { + "warmup_max_lr": 1e-04, + "warmup_min_lr": 1e-05, + "total_num_steps": 400000, + "warmup_num_steps" : 10000 + }, + "type": "WarmupDecayLR" + }, + "zero_allow_untested_optimizer": false, + "fp16": { + "enabled": true, + "loss_scale": 0, + "loss_scale_window": 1000, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "activation_checkpointing": { + "partition_activations": false, + "contiguous_memory_optimization": false + }, + "wall_clock_breakdown": false +} +EOT + +export PL_DEEPSPEED_CONFIG_PATH=$config_json +export TORCH_EXTENSIONS_DIR=/cognitive_comp/ganruyi/tmp/torch_extendsions +# strategy=ddp +strategy=deepspeed_stage_1 + +TRAINER_ARGS=" + --max_epochs 1 \ + --gpus 8 \ + --num_nodes 2 \ + --strategy ${strategy} \ + --default_root_dir $ROOT_DIR \ + --dirpath $ROOT_DIR/ckpt \ + --save_top_k 3 \ + --every_n_train_steps 100000 \ + --monitor train_loss \ + --mode min \ + --save_last \ + --val_check_interval 0.1 \ + --dataset_num_workers 4 \ + --dataloader_num_workers 4 \ + --replace_sampler_ddp False \ + --accumulate_grad_batches 2 \ +" +# --accumulate_grad_batches 8 \ +DATA_DIR=wudao_180g_bert_tokenized_512 + +DATA_ARGS=" + --train_batchsize $MICRO_BATCH_SIZE \ + --valid_batchsize $MICRO_BATCH_SIZE \ + --train_data_path ${DATA_DIR} \ + --train_split_size 0.999 \ + --max_seq_length 512 \ +" + +MODEL_ARGS=" + --pretrained_model_path /cognitive_comp/ganruyi/experiments/randeng_t5_char_700M/randeng_t5_char_700M \ + --tokenizer_type bert_tokenizer \ +" + +SCRIPTS_PATH=/cognitive_comp/ganruyi/Fengshenbang-LM/fengshen/examples/pretrain_t5/pretrain_t5.py + +export CMD=" \ + $SCRIPTS_PATH \ + $TRAINER_ARGS \ + $MODEL_ARGS \ + $DATA_ARGS \ + " + +echo $CMD +# /home/ganruyi/anaconda3/bin/python $CMD +SINGULARITY_PATH=/cognitive_comp/ganruyi/pytorch21_06_py3_docker_image_v2.sif +srun singularity exec --nv -B /cognitive_comp/:/cognitive_comp/ $SINGULARITY_PATH bash -c '/home/ganruyi/anaconda3/bin/python $CMD' + +# source activate base +# python $CMD +# srun --nodes=1 --gres=gpu:8 --ntasks-per-node=8 --cpus-per-task=30 --jobid=171866 -e %x-%j.err -o %x-%j.log python $CMD + diff --git a/fengshen/examples/pretrain_t5/pretrain_randeng_t5_large.sh b/fengshen/examples/pretrain_t5/pretrain_randeng_t5_large.sh new file mode 100644 index 0000000000000000000000000000000000000000..a91d7082a4c945fe78a2fb0ce99be7c7d9a02745 --- /dev/null +++ b/fengshen/examples/pretrain_t5/pretrain_randeng_t5_large.sh @@ -0,0 +1,132 @@ +#!/bin/bash +#SBATCH --job-name=randeng_t5_large +#SBATCH 
--nodes=2 +#SBATCH --ntasks-per-node=8 +#SBATCH --gres=gpu:8 # number of gpus +#SBATCH --cpus-per-task=30 # cpu-cores per task (>1 if multi-threaded tasks) +#SBATCH -o %x-%j.log +#SBATCH -e %x-%j.err + +set -x -e + +echo "START TIME: $(date)" +MICRO_BATCH_SIZE=8 +ROOT_DIR=/cognitive_comp/ganruyi/experiments/randeng_t5_large_v2/ +if [ ! -d ${ROOT_DIR} ];then + mkdir ${ROOT_DIR} + echo ${ROOT_DIR} created!!!!!!!!!!!!!! +else + echo ${ROOT_DIR} exist!!!!!!!!!!!!!!! +fi + +ZERO_STAGE=1 + +config_json="$ROOT_DIR/ds_config.randeng_t5_large_pretrain.$SLURM_JOBID.json" +export MASTER_PORT=$[RANDOM%10000+30000] + +cat < $config_json +{ + "train_micro_batch_size_per_gpu": ${MICRO_BATCH_SIZE}, + "steps_per_print": 100, + "gradient_clipping": 1.0, + "zero_optimization": { + "stage": $ZERO_STAGE, + "contiguous_gradients": false, + "overlap_comm": true, + "reduce_scatter": true, + "reduce_bucket_size": 50000000, + "allgather_bucket_size": 500000000 + }, + "optimizer": { + "type": "Adam", + "params": { + "lr": 1e-4, + "weight_decay": 1e-2 + } + }, + "scheduler": { + "params": { + "warmup_max_lr": 1e-04, + "warmup_min_lr": 1e-05, + "total_num_steps": 100000, + "warmup_num_steps" : 10000 + }, + "type": "WarmupDecayLR" + }, + "zero_allow_untested_optimizer": false, + "fp16": { + "enabled": true, + "loss_scale": 0, + "loss_scale_window": 1000, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "activation_checkpointing": { + "partition_activations": false, + "contiguous_memory_optimization": false + }, + "wall_clock_breakdown": false +} +EOT + +export PL_DEEPSPEED_CONFIG_PATH=$config_json +export TORCH_EXTENSIONS_DIR=/cognitive_comp/ganruyi/tmp/torch_extendsions +# strategy=ddp +strategy=deepspeed_stage_1 + +TRAINER_ARGS=" + --max_epochs 1 \ + --gpus 8 \ + --num_nodes 2 \ + --strategy ${strategy} \ + --default_root_dir $ROOT_DIR \ + --dirpath $ROOT_DIR/ckpt \ + --save_top_k 3 \ + --every_n_train_steps 1000000 \ + --monitor train_loss \ + --mode min \ + --save_last \ + --val_check_interval 0.01 \ + --preprocessing_num_workers 20 \ +" +# --accumulate_grad_batches 8 \ +DATA_DIR=wudao_180g_t5_tokenized_512 + +DATA_ARGS=" + --train_batchsize $MICRO_BATCH_SIZE \ + --valid_batchsize $MICRO_BATCH_SIZE \ + --train_data ${DATA_DIR} \ + --train_split_size 0.999 \ + --max_seq_length 512 \ +" + +MODEL_ARGS=" + --pretrained_model_path /cognitive_comp/ganruyi/hf_models/google/mt5-large \ + --new_vocab_path /cognitive_comp/ganruyi/hf_models/t5_cn_small/sentencepiece_cn.model \ + --keep_tokens_path /cognitive_comp/ganruyi/hf_models/t5_cn_small/sentencepiece_cn_keep_tokens.json \ +" +# --ckpt_path /cognitive_comp/ganruyi/experiments/randeng_t5_large/ckpt/last.ckpt \ + +SCRIPTS_PATH=/cognitive_comp/ganruyi/Fengshenbang-LM/fengshen/examples/pretrain_t5/pretrain_t5.py + +export CMD=" \ + $SCRIPTS_PATH \ + $TRAINER_ARGS \ + $MODEL_ARGS \ + $DATA_ARGS \ + " + +echo $CMD +# source activate base +# python $CMD +# srun --nodes=1 --gres=gpu:8 --ntasks-per-node=8 --cpus-per-task=30 --jobid=171866 -e %x-%j.err -o %x-%j.log python $CMD + +SINGULARITY_PATH=/cognitive_comp/ganruyi/pytorch21_06_py3_docker_image_v2.sif +srun --jobid=172781 --job-name=randeng_t5_large --nodes=2 --gres=gpu:8 --ntasks-per-node=8 --cpus-per-task=30 -e randeng_t5_large-%j.err -o randeng_t5_large-%j.log singularity exec --nv -B /cognitive_comp/:/cognitive_comp/ $SINGULARITY_PATH bash -c '/home/ganruyi/anaconda3/bin/python $CMD' + + +# to debug - add echo (it exits and prints what it would have launched) +#run_cmd="$PY_LAUNCHER $CMD" +# salloc --nodes=1 --gres=gpu:2 
--cpus-per-gpu=20 -t 24:00:00 +# clear; srun singularity exec --nv -B /cognitive_comp/:/cognitive_comp/ $SINGULARITY_PATH bash -c '/home/ganruyi/anaconda3/bin/python $CMD' +# clear; srun singularity exec --nv -B /cognitive_comp/:/cognitive_comp/ $SINGULARITY_PATH bash -c '/home/ganruyi/anaconda3/bin/python -u -m debugpy --listen 192.168.190.2:53005 --wait-for-client $CMD' \ No newline at end of file diff --git a/fengshen/examples/pretrain_t5/pretrain_t5.py b/fengshen/examples/pretrain_t5/pretrain_t5.py new file mode 100644 index 0000000000000000000000000000000000000000..7a95bc8781ca5f4e0fa3ef0cb1eea98e5d4abbe6 --- /dev/null +++ b/fengshen/examples/pretrain_t5/pretrain_t5.py @@ -0,0 +1,175 @@ +import time +from builtins import print +import sys +import os +import torch +import argparse +import json +import pytorch_lightning as pl +from transformers import MT5Config, MT5Tokenizer +from pytorch_lightning import Trainer, loggers +from transformers import MT5ForConditionalGeneration +from pytorch_lightning.callbacks import LearningRateMonitor +# os.environ["CUDA_VISIBLE_DEVICES"] = '3' + + +class MT5PretrainModel(pl.LightningModule): + + @staticmethod + def add_model_specific_args(parent_args): + parser = parent_args.add_argument_group('BaseModel') + parser.add_argument('--keep_tokens_path', default=None, type=str) + return parent_args + + def __init__(self, args): + super().__init__() + self.save_hyperparameters(args) + if args.tokenizer_type == 't5_tokenizer': + if args.new_vocab_path is not None: + # 用于从mt5继续训练,此时只保留中英文词表,spm采用新模型 + assert args.keep_tokens_path is not None + keep_tokens = json.load(open(args.keep_tokens_path)) + self.model = MT5ForConditionalGeneration.from_pretrained( + args.pretrained_model_path) + new_config = self.model.config + new_config.vocab_size = len(keep_tokens) + print('vocab_size:', new_config.vocab_size) + + new_state_dict = self.model.state_dict() + select_index = torch.tensor(keep_tokens) + new_state_dict['encoder.embed_tokens.weight'] = torch.index_select( + new_state_dict['encoder.embed_tokens.weight'], dim=0, index=select_index) + new_state_dict['shared.weight'] = torch.index_select( + new_state_dict['shared.weight'], dim=0, index=select_index) + new_state_dict['decoder.embed_tokens.weight'] = torch.index_select( + new_state_dict['decoder.embed_tokens.weight'], dim=0, index=select_index) + new_state_dict['lm_head.weight'] = torch.index_select( + new_state_dict['lm_head.weight'], dim=0, index=select_index) + self.model = MT5ForConditionalGeneration.from_pretrained( + args.pretrained_model_path, config=new_config, state_dict=new_state_dict) + # self.model = MT5ForConditionalGeneration(config=new_config) + else: + # 用于继续训练 + self.model = MT5ForConditionalGeneration.from_pretrained( + args.pretrained_model_path + ) + else: + self.model = MT5ForConditionalGeneration( + MT5Config.from_pretrained(args.pretrained_model_path) + ) + + def setup(self, stage) -> None: + if stage == 'fit': + train_loader = self.trainer._data_connector._train_dataloader_source.dataloader() + + # Calculate total steps + if self.trainer.max_epochs > 0: + world_size = self.trainer.world_size + tb_size = self.hparams.train_batchsize * max(1, world_size) + ab_size = self.trainer.accumulate_grad_batches * float(self.trainer.max_epochs) + self.total_steps = (len(train_loader.dataset) * + self.trainer.max_epochs // tb_size) // ab_size + else: + self.total_steps = self.trainer.max_steps // self.trainer.accumulate_grad_batches + + print('Total steps: {}' .format(self.total_steps)) + + def 
configure_optimizers(self): + from fengshen.models.model_utils import configure_optimizers + return configure_optimizers(self) + + def training_step(self, batch, batch_idx): + output = self.model( + input_ids=batch['input_ids'], labels=batch['labels']) + acc = self.comput_metrix(output.logits, batch['labels']) + self.log('train_loss', output.loss, sync_dist=True) + self.log('train_acc', acc, sync_dist=True) + return output.loss + + def validation_step(self, batch, batch_idx): + # print('is out of index: ', batch['input_ids'][batch['input_ids'] >= 32598]) + output = self.model( + input_ids=batch['input_ids'], labels=batch['labels']) + acc = self.comput_metrix(output.logits, batch['labels']) + self.log('val_loss', output.loss, sync_dist=True) + self.log('val_acc', acc, sync_dist=True) + + def comput_metrix(self, logits, labels): + y_pred = torch.argmax(logits, dim=-1) + y_pred = y_pred.view(size=(-1,)) + y_true = labels.view(size=(-1,)).float() + corr = torch.eq(y_pred, y_true) + acc = torch.sum(corr.float())/y_true.shape[0] + return acc + + def on_save_checkpoint(self, checkpoint) -> None: + # Save the current loop info in the mid of epoch + # if you lightning <= 1.6.0 uncomment the line below + # checkpoint['loops'] = self.trainer.checkpoint_connector._get_loops_state_dict() + if self.trainer.global_rank == 0 and self.trainer.global_step % self.hparams.every_n_train_steps == 0: + self.model.save_pretrained(os.path.join( + self.trainer.checkpoint_callback.dirpath, + 'hf_pretrained_epoch{}_step{}'.format(self.trainer.current_epoch, self.trainer.global_step))) + + def on_load_checkpoint(self, checkpoint) -> None: + global_step_offset = checkpoint["global_step"] + if 'global_samples' in checkpoint: + self.consumed_samples = checkpoint['global_samples'] + self.trainer.fit_loop.epoch_loop._batches_that_stepped = global_step_offset + + +def get_time_str(): + return time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) + + +def main(): + total_parser = argparse.ArgumentParser("Pretrain Unsupervise.") + total_parser.add_argument( + '--do_eval_only', action='store_true', default=False) + total_parser.add_argument( + '--pretrained_model_path', default=None, type=str) + total_parser.add_argument( + '--new_vocab_path', default=None, type=str) + total_parser.add_argument('--max_seq_length', default=1024, type=int) + total_parser.add_argument('--ckpt_path', default=None, type=str) + sys.path.append('../../../') + from fengshen.data.t5_dataloader.t5_datasets import UnsuperviseT5DataModel + from fengshen.utils.universal_checkpoint import UniversalCheckpoint + # * Args for data preprocessing + total_parser = UnsuperviseT5DataModel.add_data_specific_args(total_parser) + # * Args for training + total_parser = Trainer.add_argparse_args(total_parser) + total_parser = UniversalCheckpoint.add_argparse_args(total_parser) + total_parser = MT5PretrainModel.add_model_specific_args(total_parser) + # * Args for base model + args = total_parser.parse_args() + print('Argument parse success.') + print('UnsuperviseT5DataModel load start {}'.format(get_time_str())) + data_model = UnsuperviseT5DataModel(args) + print('UnsuperviseT5DataModel load end {}'.format(get_time_str())) + if not args.do_eval_only: + model = MT5PretrainModel(args) + checkpoint_callback = UniversalCheckpoint(args) + lr_monitor = LearningRateMonitor(logging_interval='step') + logger = loggers.TensorBoardLogger(save_dir=os.path.join( + args.default_root_dir, 'logs/')) + trainer = Trainer.from_argparse_args(args, + logger=logger, + 
callbacks=[checkpoint_callback, lr_monitor] + ) + trainer.fit(model, data_model, ckpt_path=args.ckpt_path) + else: + tokenizer = MT5Tokenizer.from_pretrained(args.new_vocab_path, extra_ids=0) + model = MT5PretrainModel(args=args, num_data=len(data_model.predict_dataloader())) + trainer = Trainer.from_argparse_args(args) + + result = trainer.predict(model, data_model) + result = result[0] + for i in range(4): + print(tokenizer.batch_decode(result['input_ids'][i])) + print(tokenizer.batch_decode(result['predict_ids'][i])) + print(tokenizer.batch_decode(result['labels'][i])) + + +if __name__ == '__main__': + main() diff --git a/fengshen/examples/pretrain_t5/process_data.py b/fengshen/examples/pretrain_t5/process_data.py new file mode 100644 index 0000000000000000000000000000000000000000..bae164f107f7ec3569227f3e40a292ee1641fd21 --- /dev/null +++ b/fengshen/examples/pretrain_t5/process_data.py @@ -0,0 +1,65 @@ +# coding=utf8 +import argparse +import sys +import os +from concurrent.futures import ProcessPoolExecutor + + +def _generate_cache_arrow(index, ds, path): + print('saving dataset shard {}'.format(index)) + ds.save_to_disk(os.path.join(path, 'part_{}'.format(index))) + return 'saving dataset shard {} done'.format(index) + + +def generate_arrow_cache(ds, args) -> None: + ''' + 读取wudao_180g等原数据或者tokenized之后的数据,并进行train test split + 同时利用seed 42做shuffle 缓存下来 + ''' + ds = ds.train_test_split(train_size=args.train_split_size, seed=42) + print(ds) + p = ProcessPoolExecutor(max_workers=args.preprocessing_num_workers) + res = [] + train_shard_part = args.saved_data_shards + for i in range(0, train_shard_part): + res.append(p.submit(_generate_cache_arrow, i, + ds['train'].shard(train_shard_part, i), args.saved_train_data_path)) + + p.shutdown(wait=True) + for future in res: + print(future.result(), flush=True) + + ds['test'].save_to_disk(args.saved_test_data_path) + print('done') + + +if __name__ == '__main__': + total_parser = argparse.ArgumentParser("Save data Task") + total_parser.add_argument( + '--new_vocab_path', default='/cognitive_comp/ganruyi/hf_models/t5_cn_small/sentencepiece_cn.model', type=str) + total_parser.add_argument('--preprocessing_num_workers', default=30, type=int) + total_parser.add_argument( + '--train_data_path', default='/cognitive_comp/common_data/test_wudao_180g_mt5_tokenized/', type=str) + total_parser.add_argument('--saved_data_shards', default=800, type=int) + total_parser.add_argument('--saved_train_data_path', default=None, type=str) + total_parser.add_argument('--saved_test_data_path', default=None, type=str) + total_parser.add_argument('--max_seq_length', default=512, type=int) + total_parser.add_argument('--train_split_size', default=0.999, type=float) + total_parser.add_argument('--pretrained_model_path', default=None, type=str) + total_parser.add_argument('--tokenizer_type', default='t5_tokenizer', choices=['t5_tokenizer', 'bert_tokenizer']) + total_parser.add_argument('--text_column_name', default='text') + total_parser.add_argument('--remove_columns', nargs='+', default=[]) + + # * Args for data preprocessing + args = total_parser.parse_args() + sys.path.append('../../../') + from fengshen.data.t5_dataloader.t5_datasets import UnsuperviseT5Dataset + ds = UnsuperviseT5Dataset(args.train_data_path, args) + print(ds) + generate_arrow_cache(ds.data, args=args) + # ds = UnsuperviseT5Dataset(args.train_data_path, args, load_data_type=0) + for i in range(0, 2): + print(ds.data[i]) + print(ds.tokenizer.decode(ds.data[i]['input_ids'])) + + print(ds.data) diff --git 
a/fengshen/examples/pretrain_t5/process_data_bert_tokenizer.sh b/fengshen/examples/pretrain_t5/process_data_bert_tokenizer.sh new file mode 100644 index 0000000000000000000000000000000000000000..b17187c6a26c0a5edf46cf2d9c5736338e6ff934 --- /dev/null +++ b/fengshen/examples/pretrain_t5/process_data_bert_tokenizer.sh @@ -0,0 +1,36 @@ +#!/bin/bash +#SBATCH --job-name=process_data_bert_tokenizer +#SBATCH --nodes=1 +#SBATCH --ntasks-per-node=1 # number of gpus +#SBATCH --cpus-per-task=120 # cpu-cores per task (>1 if multi-threaded tasks) +#SBATCH -o /cognitive_comp/ganruyi/experiments/randeng_t5_char_77M/%x-%j.log +#SBATCH -e /cognitive_comp/ganruyi/experiments/randeng_t5_char_77M/%x-%j.err +set -x -e + +echo "START TIME: $(date)" + +DATA_ARGS=" + --tokenizer_type bert_tokenizer \ + --train_data_path wudao_180g \ + --train_split_size 0.999 \ + --max_seq_length 512 \ + --preprocessing_num_workers 100 \ + --saved_data_shards 800 \ + --saved_train_data_path /cognitive_comp/common_data/wudao_180g_bert_tokenized_512_train/ \ + --saved_test_data_path /cognitive_comp/common_data/wudao_180g_bert_tokenized_512_test/ \ + --pretrained_model_path /cognitive_comp/ganruyi/experiments/randeng_t5_char_77M/randeng_t5_char_77M \ + --text_column_name text \ + --remove_columns token_type_ids text \ +" + + # --remove_columns text \ +SCRIPTS_PATH=/cognitive_comp/ganruyi/Fengshenbang-LM/fengshen/examples/pretrain_t5/process_data.py + +export CMD=" \ + $SCRIPTS_PATH \ + $DATA_ARGS \ + " + +echo $CMD +source activate base +/home/ganruyi/anaconda3/bin/python $CMD \ No newline at end of file diff --git a/fengshen/examples/pretrain_taiyi_clip/flickr_datasets.py b/fengshen/examples/pretrain_taiyi_clip/flickr_datasets.py new file mode 100644 index 0000000000000000000000000000000000000000..530b74122b46f33bfa3de5cf536963f3538a9d40 --- /dev/null +++ b/fengshen/examples/pretrain_taiyi_clip/flickr_datasets.py @@ -0,0 +1,35 @@ +# 这里这个dataset只是临时测试用的,所以暂时用最简陋的方式放在这里,后续会优化 +from torch.utils.data import Dataset +from PIL import Image + + +class flickr30k_CNA(Dataset): + def __init__(self, img_root_path=None, + text_annot_path=None, + data_process_fn=None): + self.images = [] + self.captions = [] + self.labels = [] + self.root = img_root_path + with open(text_annot_path, 'r') as f: + for line in f: + line = line.strip().split('\t') + key, caption = line[0].split('#')[0], line[1] + img_path = key + '.jpg' + self.images.append(img_path) + self.captions.append(caption) + self.labels.append(key) + self.data_process_fn = data_process_fn + + def __len__(self): + return len(self.images) + + def __getitem__(self, idx): + img_path = str(self.root + "/" + self.images[idx]) + instance_image = Image.open(img_path) + if not instance_image.mode == "RGB": + instance_image = instance_image.convert("RGB") + captions = self.captions[idx] + label = self.labels[idx] + image, text = self.data_process_fn(instance_image, captions) + return image, text, label diff --git a/fengshen/examples/pretrain_taiyi_clip/pretrain.py b/fengshen/examples/pretrain_taiyi_clip/pretrain.py new file mode 100644 index 0000000000000000000000000000000000000000..56e24ac370ff2b5f3ecf84a32586bc5205499b07 --- /dev/null +++ b/fengshen/examples/pretrain_taiyi_clip/pretrain.py @@ -0,0 +1,308 @@ +from pytorch_lightning import ( + LightningModule, + Trainer, +) +from pytorch_lightning.callbacks import ( + LearningRateMonitor, +) +from fengshen.models.clip import ( + TaiyiCLIPModel, + TaiyiCLIPProcessor, +) +from fengshen.models.model_utils import ( + add_module_args, + 
configure_optimizers, + get_total_steps, +) +import torch +import torch.nn.functional as F +import argparse +import math +from fengshen.data.universal_datamodule import UniversalDataModule +from fengshen.data.taiyi_stable_diffusion_datasets.taiyi_datasets import add_data_args, load_data +from fengshen.utils.universal_checkpoint import UniversalCheckpoint +import os +import numpy as np +from torchvision.transforms import Normalize, Compose, RandomResizedCrop, InterpolationMode, ToTensor + +OPENAI_DATASET_MEAN = (0.48145466, 0.4578275, 0.40821073) +OPENAI_DATASET_STD = (0.26862954, 0.26130258, 0.27577711) + + +class Collator(): + def __init__(self, args, processor): + self.processor = processor + self.seq_length = args.seq_length + self.transforms = Compose([ + ToTensor(), + RandomResizedCrop(args.resolution, scale=(0.9, 1.0), + interpolation=InterpolationMode.BICUBIC), + Normalize(mean=OPENAI_DATASET_MEAN, std=OPENAI_DATASET_STD), + ]) + + def __call__(self, inputs): + max_length = min(self.seq_length, max([len(i['caption']) for i in inputs])) + images = [] + texts = [] + labels = [] + for i in inputs: + # instance_image = Image.open(i['img_path']) + # instance_image = jpeg4py.JPEG(i['img_path']).decode() + instance_image = np.load(i['npy_path']) + images.append(self.transforms(instance_image)) + texts.append(i['caption']) + labels.append(i['labels'] if 'labels' in i else -100) + # images_input = self.processor(images=images, return_tensors="pt") + texts_input = self.processor(text=texts, + max_length=max_length, + padding='max_length', + truncation=True, + return_tensors='pt') + # return images_input, texts_input, labels + return {'pixel_values': torch.stack(images)}, texts_input, labels + + +class TaiyiCLIP(LightningModule): + @staticmethod + def add_module_specific_args(parent_parser): + parser = parent_parser.add_argument_group('Taiyi CLIP') + parser.add_argument('--loss_type', choices=['local', 'global'], default='local') + parser.add_argument('--seq_length', default=77) + parser.add_argument('--gather_with_grad', default=False, action='store_true') + parser.add_argument('--freeze_image_tower', default=False, action='store_true') + return parent_parser + + def __init__(self, args, **kwargs) -> None: + super().__init__() + self.save_hyperparameters(args) + + self.model = TaiyiCLIPModel.from_pretrained(args.model_path) + self.processor = TaiyiCLIPProcessor.from_pretrained(args.model_path) + + self.local_loss = args.loss_type == 'local' + + if args.freeze_image_tower: + for param in self.model.vision_model.parameters(): + param.requires_grad = False + self.model.visual_projection.requires_grad = False + + # cache + self.cache_labels = True + self.prev_num_logits = 0 + self.labels = {} + + def setup(self, stage) -> None: + if stage == 'fit': + self.total_steps = get_total_steps(self.trainer, self.hparams) + print('Total steps: {}' .format(self.total_steps)) + elif stage == 'validate': + self.total_steps = 100 + + def configure_optimizers(self): + return configure_optimizers(self) + + def forward(self, image, text): + assert image is not None + assert text is not None + image_features = self.model.get_image_features(**image) + text_features = self.model.get_text_features(**text) + + image_features = image_features / image_features.norm(p=2, dim=-1, keepdim=True) + text_features = text_features / text_features.norm(p=2, dim=-1, keepdim=True) + + return image_features, text_features, self.model.logit_scale.exp() + + def gather_features(self, features): + if self.trainer.world_size == 1: + 
return features + all_features = self.all_gather( + features, sync_grads=self.hparams.gather_with_grad) + if not self.local_loss and not self.gather_with_grad: + # 如果是全局loss,并且不需要梯度,需要把梯度更新回tensor + all_features[self.global_rank] = features + all_features = all_features.view(-1, all_features.shape[-1]) + return all_features + + def clip_loss(self, image_features, text_features, logit_scale): + + logits_per_image = None + + # 如果我冻住VIT并且是local_loss,那么我只需要自己的这部分text feature就行 + # 因为根本不需要image2text的feature训练VIT + if self.hparams.freeze_image_tower and self.local_loss: + all_text_features = None + else: + all_text_features = self.gather_features( + text_features) + all_image_features = self.gather_features( + image_features) + + if self.local_loss: + if all_text_features is not None: + logits_per_image = logit_scale * image_features @ all_text_features.T + logits_per_text = logit_scale * text_features @ all_image_features.T + else: + # 如果是global_loss,那all_text_features肯定不是空的 + logits_per_image = logit_scale * all_image_features @ all_text_features.T + logits_per_text = logits_per_image.T + + num_logits = logits_per_text.shape[0] + if self.prev_num_logits != num_logits or self.device not in self.labels: + labels = torch.arange(num_logits, device=self.device, dtype=torch.long) + if self.trainer.world_size > 1 and self.local_loss: + labels = labels + num_logits * self.global_rank + if self.cache_labels: + self.labels[self.device] = labels + self.prev_num_logits = num_logits + else: + labels = self.labels[self.device] + + total_loss = ( + F.cross_entropy(logits_per_image, labels) + + F.cross_entropy(logits_per_text, labels) + ) / 2 if logits_per_image is not None else F.cross_entropy(logits_per_text, labels) + return total_loss + + def training_step(self, batch): + image, text, _ = batch + image_features, text_features, logit_scale = self(image, text) + total_loss = self.clip_loss(image_features, text_features, logit_scale) + self.log('train_loss', total_loss, sync_dist=False) + return total_loss + + def on_train_batch_end(self, outputs, batch, batch_idx: int) -> None: + with torch.no_grad(): + self.model.logit_scale.clamp_(0, math.log(100)) + + def get_metrics(self, image_features, text_features, labels, logit_scale): + # 计算相似度,支持多个样本的情况(比如一个图片有多个caption) + # img2txt计算的时候要用到,因为一张图片可能对应多个文本。 + # txt2img计算的时候不需要(一般一个text只有一个对应图片) + metrics = {} + logits_per_image = (logit_scale * image_features @ text_features.t()).detach().cpu() + logits_per_text = logits_per_image.t().detach().cpu() + + logits = {"image_to_text": logits_per_image, "text_to_image": logits_per_text} + + label2idx = {} # 计算label到idx的映射。 + repeat_id = [] + for i, label in enumerate(labels): + if label not in label2idx: + label2idx[label] = [i] + else: + # 表示该index的标签出现过,记录这个index,后续算txt2img分数的时候,这些index的权值要降低。 + label2idx[label].append(i) + repeat_id.append(i) + + ground_truth = [label2idx[label] for label in labels] + + for name, logit in logits.items(): + if name == 'text_to_image': + logit[:, repeat_id] -= 1e8 # 这部分的分数要降低。(重复出现的图片,直接忽略) + r_stat = {1: [], 5: [], 10: []} + # r1_stat, r5_stat, r10_stat = [], [], [] + # index of the largest element to the smallest + ranking = torch.argsort(logit, descending=True) + for i, each_query in enumerate(ranking[:, :10]): + for j, q in enumerate(each_query): + found = False + if q in ground_truth[i]: + for k, v in r_stat.items(): + if j < k: + found = True + v.append(1) + if found: + break + for k, v in r_stat.items(): + metrics[f'{name}_R@{k}'] = sum(v)/len(logit) + return metrics + + def 
validation_step(self, batch, batch_idx): + image, text, label = batch + image_features, text_features, logit_scale = self(image, text) + return image_features, text_features, logit_scale, text['input_ids'].shape[0], label + + def validation_epoch_end(self, val_outputs): + all_image_features = [] + all_text_features = [] + all_labels = [] + sample_size = 0 + for o in val_outputs: + all_image_features.append(o[0]) + all_text_features.append(o[1]) + sample_size += o[3] + all_labels += o[4] + if len(all_image_features) == 0 or len(all_text_features) == 0: + return + all_image_features = torch.cat(all_image_features) + all_text_features = torch.cat(all_text_features) + logit_scale = val_outputs[0][2].mean() + logits_per_image = logit_scale * all_image_features @ all_text_features.t() + logits_per_text = logits_per_image.t() + + labels = torch.arange(sample_size, device=self.device).long() + total_loss = (F.cross_entropy(logits_per_image, labels) + + F.cross_entropy(logits_per_text, labels)) / 2 + + val_metrics = self.get_metrics( + image_features=all_image_features, + text_features=all_text_features, + logit_scale=logit_scale, + labels=all_labels) + loss = total_loss / sample_size + self.log('val_loss', loss, sync_dist=False) + for k, v in val_metrics.items(): + self.log(f'val_{k}', v, sync_dist=False) + + def on_load_checkpoint(self, checkpoint) -> None: + # 兼容低版本lightning,低版本lightning从ckpt起来时steps数会被重置为0 + global_step_offset = checkpoint["global_step"] + if 'global_samples' in checkpoint: + self.consumed_samples = checkpoint['global_samples'] + self.trainer.fit_loop.epoch_loop._batches_that_stepped = global_step_offset + + def on_save_checkpoint(self, checkpoint) -> None: + # 保存的时候把权重按huggingface的形式保存出来 + if self.global_rank == 0: + dir_path = os.path.join( + self.hparams.default_root_dir, f'hf_out_{self.trainer.current_epoch}_{self.trainer.global_step}') + if not os.path.exists(dir_path): + os.mkdir(dir_path) + self.model.save_pretrained(dir_path) + self.processor.save_pretrained(dir_path) + + +if __name__ == '__main__': + args_parser = argparse.ArgumentParser() + args_parser = add_module_args(args_parser) + args_parser = add_data_args(args_parser) + args_parser = UniversalDataModule.add_data_specific_args(args_parser) + args_parser = Trainer.add_argparse_args(args_parser) + args_parser = TaiyiCLIP.add_module_specific_args(args_parser) + args_parser = UniversalCheckpoint.add_argparse_args(args_parser) + args = args_parser.parse_args() + + lr_monitor = LearningRateMonitor(logging_interval='step') + checkpoint_callback = UniversalCheckpoint(args) + + trainer = Trainer.from_argparse_args(args, + callbacks=[ + lr_monitor, + checkpoint_callback]) + + model = TaiyiCLIP(args) + processor = model.processor + collate_fn = Collator(args, processor) + datasets = load_data(args, global_rank=trainer.global_rank) + + # 加载单个验证集:!!!验证代码有效性临时这样干的,验证完有效性会删除 + from fengshen.examples.pretrain_taiyi_clip.flickr_datasets import flickr30k_CNA + img_root = '/shared_space/ccnl/mm_data/Flickr30k-CNA/flickr30k/images' + text_annot_path = '/shared_space/ccnl/mm_data/Flickr30k-CNA/test/flickr30k_cn_test.txt' + + datasets[args.val_datasets_field] = flickr30k_CNA(img_root, text_annot_path, collate_fn) + + datamoule = UniversalDataModule( + tokenizer=None, collate_fn=collate_fn, args=args, datasets=datasets) + + trainer.fit(model, datamoule, ckpt_path=args.load_ckpt_path) diff --git a/fengshen/examples/pretrain_taiyi_clip/test.py b/fengshen/examples/pretrain_taiyi_clip/test.py new file mode 100644 index 
0000000000000000000000000000000000000000..c5927a8688618678c8838162bf0c42fac6067e19 --- /dev/null +++ b/fengshen/examples/pretrain_taiyi_clip/test.py @@ -0,0 +1,36 @@ +from pytorch_lightning import ( + Trainer, +) +from fengshen.models.model_utils import ( + add_module_args, +) +import argparse +from fengshen.data.universal_datamodule import UniversalDataModule +from fengshen.utils.universal_checkpoint import UniversalCheckpoint +from fengshen.examples.pretrain_taiyi_clip.pretrain import ( + TaiyiCLIP, + Collator, +) +from fengshen.data.fs_datasets import load_dataset +from torch.utils.data import DataLoader + +if __name__ == '__main__': + args_parser = argparse.ArgumentParser() + args_parser = add_module_args(args_parser) + args_parser = UniversalDataModule.add_data_specific_args(args_parser) + args_parser = Trainer.add_argparse_args(args_parser) + args_parser = TaiyiCLIP.add_module_specific_args(args_parser) + args_parser = UniversalCheckpoint.add_argparse_args(args_parser) + args = args_parser.parse_args() + checkpoint_callback = UniversalCheckpoint(args) + trainer = Trainer.from_argparse_args(args, callbacks=[ + checkpoint_callback + ]) + + model = TaiyiCLIP(args) + processor = model.processor + collate_fn = Collator(processor) + datasets = load_dataset(args.datasets_name) + dataloader = DataLoader(datasets[args.test_datasets_field], + batch_size=args.test_batchsize, num_workers=2, collate_fn=collate_fn) + trainer.validate(model, dataloaders=dataloader, ckpt_path=args.load_ckpt_path) diff --git a/fengshen/examples/pretrain_taiyi_clip/test.sh b/fengshen/examples/pretrain_taiyi_clip/test.sh new file mode 100644 index 0000000000000000000000000000000000000000..729fa870407ec42b5cd48872c6acb9f5a4c8bf4f --- /dev/null +++ b/fengshen/examples/pretrain_taiyi_clip/test.sh @@ -0,0 +1,43 @@ +#!/bin/bash +#SBATCH --job-name=finetune_taiyi # create a short name for your job +#SBATCH --nodes=1 # node count +#SBATCH --ntasks-per-node=8 # number of tasks to run per node +#SBATCH --cpus-per-task=30 # cpu-cores per task (>1 if multi-threaded tasks) +#SBATCH --gres=gpu:8 # number of gpus per node +#SBATCH -o %x-%j.log # output and error log file names (%x for job id) +#SBATCH -x dgx050 + +# pwd=Fengshenbang-LM/fengshen/examples/pretrain_erlangshen + +NNODES=1 +GPUS_PER_NODE=1 + +MICRO_BATCH_SIZE=64 + +DATA_ARGS="\ + --test_batchsize $MICRO_BATCH_SIZE \ + --datasets_name flickr30k-CNA \ + " + +MODEL_ARGS="\ + --model_path /cognitive_comp/gaoxinyu/github/Fengshenbang-LM/fengshen/workspace/taiyi-clip-huge-v2/hf_out_0_661 \ + " + +TRAINER_ARGS="\ + --gpus $GPUS_PER_NODE \ + --num_nodes $NNODES \ + --strategy ddp \ + --log_every_n_steps 0 \ + --default_root_dir . \ + --precision 32 \ + " +# num_sanity_val_steps, limit_val_batches 通过这俩参数把validation关了 + +export options=" \ + $DATA_ARGS \ + $MODEL_ARGS \ + $TRAINER_ARGS \ + " + +CUDA_VISIBLE_DEVICES=0 python3 test.py $options +#srun -N $NNODES --gres=gpu:$GPUS_PER_NODE --ntasks-per-node=$GPUS_PER_NODE --cpus-per-task=20 python3 pretrain.py $options diff --git a/fengshen/examples/qa_t5/README.md b/fengshen/examples/qa_t5/README.md new file mode 100644 index 0000000000000000000000000000000000000000..fffd0ac176970683240127ce9f7b29c0f0e0ea97 --- /dev/null +++ b/fengshen/examples/qa_t5/README.md @@ -0,0 +1,98 @@ +# 燃灯系列-T5问答模型微调 +## 简介 Brief Introduction + Here are codes for finetuning Randeng-T5-QA-Chinese. The model was pretrained on the Wudao 180G corpus, and finetuned on Chinese SQuAD and CMRC2018 dataset. 
It can produce a fluent and accurate answer given a passage and question.
+
+这是中文的生成式问答模型[Randeng-T5-QA-Chinese](https://huggingface.co./IDEA-CCNL/Randeng-T5-784M-QA-Chinese)的微调代码。它基于T5-Large结构,使用悟道180G语料在[封神框架](https://github.com/IDEA-CCNL/Fengshenbang-LM/tree/main/fengshen)进行预训练,在ChineseSQuAD和CMRC2018两个阅读理解数据集上进行微调。输入一篇文章和一个问题,可以生成准确流畅的回答。
+
+## 模型类别 Model Taxonomy
+
+| 需求 Demand | 任务 Task | 系列 Series | 模型 Model | 参数 Parameter | 额外 Extra |
+| :----: | :----: | :----: | :----: | :----: | :----: |
+| 通用 General | 自然语言转换 NLT | 燃灯 Randeng | T5 | 784M | 中文生成式问答 Chinese Generative Question Answering |
+
+模型架构
+
+| 配置 | 参数 |
+| ---- | ---- |
+| encoder layers | 12 |
+| encoder_attention_heads | 16 |
+| encoder_ffn_dim | 2816 |
+| decoder layers | 24 |
+| decoder_attention_heads | 16 |
+| decoder_ffn_dim | 2816 |
+| max_encode_length | 1024 |
+
+## 模型表现 Performance
+
+CMRC 2018的测试集上的效果(原始任务是一个起始和结束预测问题,这里作为一个生成回答的问题):
+
+| model | Contain Answer Rate | RougeL | BLEU-4 | F1 | EM |
+| ----- | ------------------- | ------ | ------ | ---- | ---- |
+| Ours | 76.0 | 82.7 | 61.1 | 77.9 | 57.1 |
+
+Our model enjoys a high level of generation quality and accuracy: 76% of the generated answers contain the ground truth. The high RougeL and BLEU-4 scores reflect the large overlap between the generated results and the ground truth. Our model has a lower EM because it generates complete sentences while the golden answers are usually sentence fragments.
+
+我们的模型有着极高的生成质量和准确率,76%的回答包含了正确答案(Contain Answer Rate)。RougeL和BLEU-4反映了模型预测结果和标准答案重合的程度。我们的模型EM值较低,因为生成的大部分为完整的句子,而标准答案通常是句子片段。
+
+## 模型
+
+T5-Large: [Randeng-T5-784M-QA-Chinese](https://huggingface.co./IDEA-CCNL/Randeng-T5-784M-QA-Chinese)
+
+文件:
+ - qa_dataset.py 数据集的处理,包含dataset和dataloader
+ - finetune_t5_cmrc.py 模型微调核心代码
+ - run_finetune.sh 微调脚本(未安装deepspeed的话strategy参数改为ddp)
+ - run_predict.sh 预测脚本
+
+## 使用 Usage
+
+下面是一个推理的最小示例。长度参数与run_finetune.sh中的设置保持一致,输入格式(以`answer:<extra_id_0></s>`结尾)与qa_dataset.py的预处理保持一致:
+
+```python
+import torch
+from transformers import T5Tokenizer, MT5ForConditionalGeneration
+
+pretrain_path = 'IDEA-CCNL/Randeng-T5-784M-QA-Chinese'
+tokenizer = T5Tokenizer.from_pretrained(pretrain_path)
+model = MT5ForConditionalGeneration.from_pretrained(pretrain_path)
+
+# 长度设置与run_finetune.sh一致
+max_seq_length = 512
+max_knowledge_length = 425
+max_target_length = 128
+
+sample = {"context": "在柏林,胡格诺派教徒创建了两个新的社区:多罗西恩斯塔特和弗里德里希斯塔特。到1700年,这个城市五分之一的人口讲法语。柏林胡格诺派在他们的教堂服务中保留了将近一个世纪的法语。他们最终决定改用德语,以抗议1806-1807年拿破仑占领普鲁士。他们的许多后代都有显赫的地位。成立了几个教会,如弗雷德里夏(丹麦)、柏林、斯德哥尔摩、汉堡、法兰克福、赫尔辛基和埃姆登的教会。", "question": "除了多罗西恩斯塔特,柏林还有哪个新的社区?", "idx": 1}
+plain_text = 'question:' + sample['question'] + 'knowledge:' + sample['context'][:max_knowledge_length]
+
+# 输入以 "answer:<extra_id_0></s>" 结尾,与qa_dataset.py中regular_tokenize的处理一致
+res_prefix = tokenizer.encode('answer:', add_special_tokens=False)
+res_prefix.append(tokenizer.convert_tokens_to_ids('<extra_id_0>'))
+res_prefix.append(tokenizer.eos_token_id)
+l_rp = len(res_prefix)
+
+tokenized = tokenizer.encode(plain_text, add_special_tokens=False,
+                             truncation=True, max_length=max_seq_length - 2 - l_rp)
+tokenized += res_prefix
+input_ids = torch.tensor([tokenized], dtype=torch.long)
+
+# Generate answer
+pred_ids = model.generate(input_ids=input_ids, max_new_tokens=max_target_length,
+                          do_sample=True, top_p=0.9)
+print(tokenizer.batch_decode(pred_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0])
+```
+
+## 引用 Citation
+如果您在您的工作中使用了我们的模型,可以引用我们的[论文](https://arxiv.org/abs/2210.08590):
+
+If you are using the resource for your work, please cite our [paper](https://arxiv.org/abs/2210.08590):
+
+```text
+@article{fengshenbang,
+  author = {Junjie Wang and Yuxiang Zhang and Lin Zhang and Ping Yang and Xinyu Gao and Ziwei Wu and Xiaoqun Dong and Junqing He and Jianheng Zhuo and Qi Yang and Yongfeng Huang and Xiayu Li and Yanghan Wu and Junyu Lu and Xinyu Zhu and Weifeng Chen and Ting Han and Kunhao Pan and Rui Wang and Hao Wang and Xiaojun Wu and Zhongshen Zeng and Chongpei Chen and Ruyi Gan
and Jiaxing Zhang}, + title = {Fengshenbang 1.0: Being the Foundation of Chinese Cognitive Intelligence}, + journal = {CoRR}, + volume = {abs/2209.02970}, + year = {2022} +} +``` + +You can also cite our [website](https://github.com/IDEA-CCNL/Fengshenbang-LM/): + +欢迎引用我们的[网站](https://github.com/IDEA-CCNL/Fengshenbang-LM/): +```text +@misc{Fengshenbang-LM, + title={Fengshenbang-LM}, + author={IDEA-CCNL}, + year={2021}, + howpublished={\url{https://github.com/IDEA-CCNL/Fengshenbang-LM}}, +} +``` \ No newline at end of file diff --git a/fengshen/examples/qa_t5/example_data.json b/fengshen/examples/qa_t5/example_data.json new file mode 100644 index 0000000000000000000000000000000000000000..1b9c12d9f4eb60a10cb316504d8752097b3eacd7 --- /dev/null +++ b/fengshen/examples/qa_t5/example_data.json @@ -0,0 +1,5 @@ +{"context": "郑思肖,原名郑之因,祖籍连江(今福建福州连江县),字忆翁,号所南,又号三外野人,宋末元初画家、诗人。南宋亡国后,孤身隐居苏州,终身未娶。郑思肖生于南宋理宗淳祐元年(1241年)。祖父郑咸曾任枝江县主簿。父亲郑起,字叔起,号菊山,从事教书生涯。母亲楼氏。南宋末年,通过科举,考中秀才,为理宗时的太学上舍,应博学宏词科。咸淳三年(1267年) 元人南下攻宋,咸淳九年(1273年)襄阳失守。郑思肖献策抵抗,因“辞切直,忤当路”,未被采纳。祥兴二年(1279年)南宋灭亡,郑思肖隐居吴中(今江苏苏州),改名思肖(因宋朝皇帝姓赵),改字忆翁;为了寄托爱国情怀,郑思肖坐卧必向南,并自号“所南”;所居之处命名为“本穴世界”(把“本”字中的“十”置于“穴”字中,便是“大宋”,以词寓意其乃大宋遗民,不忘故国。元仁宗延祐五年(1318年),郑思肖于苏州觉报寺内逝世。郑思肖专工画兰,特征为花和叶萧疏,画兰不画土地和根,寓意宋朝沦亡。其存世作品有《国香图卷》、《墨兰图卷》、《墨兰图》等。其中《墨兰图卷》藏于日本大阪市立美术馆,《墨兰图》藏于美国耶鲁大学美术馆。郑思肖在《画菊》中托物言志,以菊花自比,隐含了诗人的人生遭际和理想追求。这首诗的意思是:菊花不与百花为丛,独立却意趣未穷,宁愿在枝头上枯死、遗留芬芳,也不向元朝(北风)投降。常用于表达高尚的民族气节。2017年6月中国国民党主席洪秀柱在海峡论坛的讲话中引用了这首诗的后两句,说“现实轨迹总未必是尽如人意,但逆境才能锤炼出钢铁的意志”。郑思肖的作品有《郑所南先生文集》、《一百二十图诗集》、《心史》等。其中《心史》是郑思肖在南宋灭亡之后写下的一部诗文总集,分上下两卷。史学家陈寅恪很推崇郑思肖,在《柳如是别传》中曾写“所南心史,固非吴井之藏;孙盛阳秋,同是辽东旧本。”郭沫若在抗日战争期间写的《国画中的民族意识》中,称赞郑思肖是“民族意识浓烈的人”。", "answer": ["为了寄托爱国情怀"], "question": "祥兴二年(1279年)以后,郑思肖为何坐卧必向南,并自号“所南”", "idx": 10045, "ans_span": [[267, 275]]} +{"context": "网络信标(web beacon)也称网页臭虫(web bug),是可以暗藏在任何网页元素或邮件内的1像素大小的透明GIF或PNG图片,常用来收集目标电脑用户的上网习惯等数据,并将这些数据写入Cookie。网络信标和垃圾邮件中较为常用。网络臭虫(Web bug)也称为网络信标(Web beacon),是一个放置在网页或电子邮件上的文件对象,用于监测用户的行为。它不像Cookie那样可以被浏览器用户接受或拒绝,网络臭虫只以图形交换格式(GIF)或其他文件对象的形式出现。它通常只能被检测,如果用户查看网页的源版本会发现一个从不同的Web服务器而不是从网页的其他部分负载的标签。虽然互联网隐私倡导者反对使用网络臭虫,但是他们大部分承认网络臭虫有积极用途,例如跟踪侵犯版权的网站。根据Richard M.Smith,网络臭虫(Web bug)可以收集以下资料:网络臭虫(Web bug)经常被垃圾邮件发送者用来验证电子邮件地址。当收件人打开一封有网络臭虫的电子邮件时,返回给发件人的信息就会显示邮件已被打开,这样就可以确认电子邮件地址是有效的。信标API(Beacon API)是一种较新的Web技术,它不需要使用不可见图像或类似手段就能达到相同的目的。,它还是一个万维网联盟的候选建议。其旨在使Web开发人员能在用户离开页面时将信息(如分析或诊断数据)发回Web服务器,以跟踪用户的活动。使用Web信标API能够不干扰或影响网站导航的完成此种跟踪,并且对最终用户不可见。信标API已于2014年被相继引入到Mozilla Firefox和Google Chrome网页浏览器。", "answer": ["图形交换格式(GIF)或其他文件对象的形式出现"], "question": "网络臭虫以什么样的形式出现在网页中?", "idx": 9359, "ans_span": [[211, 234]]} +{"context": "10号球,是一种新兴的花式撞球运动项目,其基本玩法与9号球类似,但多了一颗10号子球,而且击球前必须先指定球、指定袋,所以困难度提高很多,颇具发展潜力。参赛双方比球决定第一局谁先开球,此后各局采轮流开球制。排球时10颗子球紧密排成三角形,1号球在前端,并位于脚点上,10号球在三角形中间,其他各球位置不限。开球前应将母球放置于发球线后,并以球杆撞击母球使其先碰到1号球。开球时若无子球进袋,至少应有4颗子球触碰台边,否则即为犯规。开完球的第一杆可以做push out,要使用必须事先声明。所谓push out是指可以把母球推到任何一个位置,不受先碰到号码最小的球这条规则限制,你想把球打进也可以(但是10号打进要捡起来放回脚点)。若母球落袋一样算犯规,此时对手可以选择打或是不打。若开完球之后,无法打到目标球,选手通常会作push out,然后双方进入防守战。击球前必须先表明要将那一球打进那一袋,即所谓「指定球、指定袋」。若是要进的球很明确,则可省略指定球;但无论何种状况,均不可省略指定袋。至于子球碰撞颗星或与其他球的碰撞方式,均不必说明。每次击球时,母球必须先碰撞台面上号码最小的球,才算合法击球。当击球者将正确的球打进正确的袋后,始得以继续击球;否则换对手击球。若是指定球进错袋,或者进错球,也要换对手击球;但此时对手可以选择打或不打。比赛中先将10号球打进袋者赢得该局击球者若发生以下情形,将换由对手发自由球,亦即可将母球放置于台面上的任何位置再行击球。", "answer": ["击球前必须先表明要将那一球打进那一袋"], "question": "指定球、指定袋又是指什么?", "idx": 7246, "ans_span": [[380, 398]]} +{"context": 
"分析机是由英国数学家查尔斯·巴贝奇设计的一种机械式通用计算机。从1837年首次提出这种机器的设计,一直到他去世的1871年,由于种种原因,这种机器并没有被真正的制造出来。但它本身的设计逻辑却十分先进,是大约100年后电子通用计算机的先驱。查尔斯·巴贝奇最初尝试的所谓差分机,可以通过求解差分来计算对数表和三角函数表,然后能近似计算多项式。由于巴贝奇与他的首席工程师起了争执,英国政府就撤回了这项项目的资金,差分机也因此没能完成。在这期间,巴贝奇意识到建造一种更加通用的机器(即所谓的分析机)是可行的,于是便于1833年开始了分析机的设计。分析机由蒸汽机驱动,大约有30米长、10米宽。它的输入由程序和数据组成,并使用打孔卡输入,这种输入方法被当时的织布机广泛采用。分析机通过一台打印机、一个弯曲的绘图仪和一个铃铛输出,也可以在纸上打孔以便日后读取。分析机采取普通的十进制定点计数法。它的“记忆体”大约可以存储1000个40位的十进制数(每个数约16.2kB)。有一个算术逻辑单元可以进行四则运算、比较和求平方根操作。刚开始研制的时候,分析机的外观被普遍认为和差分机相似 。1858年的图纸呈现了一个有规律的网格布局。与现代计算机的中央处理器(CPU)类似,其算术逻辑单元使用的微程序存储在插在被称为“桶”的滚筒上的支柱中,这为用户指定更加复杂的运算提供了便利。分析机使用的编程语言与今天的汇编语言类似,支持循环语句和条件分支,因此这门语言被认为是图灵完备的。分析机采用三种不同的打孔卡和读卡器来区分算术运算、数字常量和存储的指令,以此实现了数字在存储器和运算单元之间的加载和存储操作。巴比奇在1837至1840年间写下了24份程序,并在之后又写了一份。这些程序可以计算多项式、迭代公式、高斯消去法和伯努利数。", "answer": ["1837年"], "question": "分析机设计首次提出是什么时候?", "idx": 3905, "ans_span": [[32, 37]]} +{"context": "金文秀(Kim Moon-Soo,),已退休韩国男子羽毛球运动员。金文秀曾经两度赢得世界锦标赛男子双打冠军。他也曾赢得1面奥运会男子双打金牌,及3次全英公开赛男子双打冠军。这些荣誉均是与同胞朴柱奉共同获得,那也是他羽毛球生涯中的主要搭档。2002年,他被选入羽毛球名人堂。金文秀曾于1992年夏季奥林匹克运动会羽毛球比赛代表韩国出赛。他与朴柱奉搭档参加男子双打项目,在决赛中以15-11, 15-7击败来自印尼的洪忠中、郭宏源组合而夺得金牌。在伦敦奥运会的女双赛事发生「消极比赛事件」,事后,韩国羽协重罚涉事的教练和球员。身为女双教练的金文秀被韩国羽协剥夺教练资格,此后不能在韩国国内俱乐部队执教。", "answer": ["朴柱奉"], "question": "金文秀1992年和谁一起参加了男子双打项目?", "idx": 8479, "ans_span": [[95, 98]]} \ No newline at end of file diff --git a/fengshen/examples/qa_t5/finetune_t5_cmrc.py b/fengshen/examples/qa_t5/finetune_t5_cmrc.py new file mode 100644 index 0000000000000000000000000000000000000000..c8f2c30254f7b907921a83a07dba802279838ac9 --- /dev/null +++ b/fengshen/examples/qa_t5/finetune_t5_cmrc.py @@ -0,0 +1,450 @@ +# -*- encoding: utf-8 -*- +''' +Copyright 2022 The International Digital Economy Academy (IDEA). CCNL team. All rights reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+@File : finetune_t5_cmrc.py +@Time : 2022/10/28 19:57 +@Author : He Junqing +@Version : 1.0 +@Contact : hejunqing@idea.edu.cn +@License : (C)Copyright 2022-2023, CCNL-IDEA +''' +# here put the import lib + +import pytorch_lightning as pl +import os +import sys +import time +import torch +import argparse +from collections import Counter +from fengshen.utils.utils import chinese_char_tokenize +from fengshen.data.universal_datamodule import UniversalDataModule +from pytorch_lightning import Trainer, loggers +from pytorch_lightning.callbacks import LearningRateMonitor +from transformers import MT5ForConditionalGeneration, T5Tokenizer, MT5Config +from torchmetrics.text.rouge import ROUGEScore +from nltk.translate.bleu_score import corpus_bleu + +torch.cuda.empty_cache() + + +class QAFinetuneModel(pl.LightningModule): + @staticmethod + def add_model_specific_args(parent_args): + parser = parent_args.add_argument_group("BaseModel") + parser.add_argument("--prediction_res_path", default=None, type=str) + parser.add_argument( + "--decode_strategy", + default="greedy", + choices=["beamsearch", "sampling", "greedy"], + ) + return parent_args + + def __init__(self, args): + super().__init__() + self.save_hyperparameters(args) + self.formator = args.formator + self.max_target_length = args.max_target_length + self.decode_strategy = args.decode_strategy + self.rouge_metric = ROUGEScore( + rouge_keys=("rougeL", "rouge1", "rouge2"), normalizer=lambda x: x + ) + self.loss_func = torch.nn.CrossEntropyLoss(reduction="none") + + self.model = MT5ForConditionalGeneration.from_pretrained( + args.pretrained_model_path + ) + print("using MT5 model") + + if args.tokenizer_type == "t5_tokenizer": + self.tokenizer = T5Tokenizer.from_pretrained(args.pretrained_model_path) + print("vocab_size:", len(self.tokenizer)) + # self.tokenizer.add_special_tokens(special_token_dict) + # print('add special tokens to tokenizer,vocab size:',len(self.tokenizer)) + else: + print("now only the t5_tokenizer is supported") + self.bleu_val = [] + + def setup(self, stage=None) -> None: + + if stage == "fit": + train_loader = ( + self.trainer._data_connector._train_dataloader_source.dataloader() + ) + + # Calculate total steps + if self.trainer.max_epochs > 0: + world_size = self.trainer.world_size + tb_size = self.hparams.train_batchsize * max(1, world_size) + ab_size = self.trainer.accumulate_grad_batches * float( + self.trainer.max_epochs + ) + self.total_steps = ( + len(train_loader.dataset) * self.trainer.max_epochs // tb_size + ) // ab_size + else: + self.total_steps = ( + self.trainer.max_steps // self.trainer.accumulate_grad_batches + ) + + print("Total steps: {}".format(self.total_steps)) + # return super().setup(stage) + + def configure_optimizers(self): + from fengshen.models.model_utils import configure_optimizers + + return configure_optimizers(self) + + def on_save_checkpoint(self, checkpoint) -> None: + # Save the current loop info in the mid of epoch + # if you lightning <= 1.6.0 uncomment the line below + # checkpoint['loops'] = self.trainer.checkpoint_connector._get_loops_state_dict() + if ( + self.trainer.global_rank == 0 + and self.trainer.global_step % self.hparams.every_n_train_steps == 0 + ): + self.model.save_pretrained( + os.path.join( + self.trainer.checkpoint_callback.dirpath, + "hf_pretrained_epoch{}_step{}".format( + self.trainer.current_epoch, self.trainer.global_step + ), + ) + ) + + def on_load_checkpoint(self, checkpoint) -> None: + global_step_offset = checkpoint["global_step"] + if "global_samples" in 
checkpoint: + self.consumed_samples = checkpoint["global_samples"] + self.trainer.fit_loop.epoch_loop._batches_that_stepped = global_step_offset + + def training_step(self, batch, batch_idx): # todo: change + if self.formator == "t5style": + output = self.model( + input_ids=batch["input_ids"], + labels=batch["labels"], + decoder_input_ids=batch["decoder_input_ids"], + ) + else: + output = self.model( + input_ids=batch["input_ids"], + input_token_type=batch["token_types"], + labels=batch["labels"], + decoder_input_ids=batch["decoder_input_ids"], + ) + # print(output.logits) + acc = self.comput_metrix(output.logits, batch["labels"]) + grad = get_gradient_norm(self.model) + self.log("train_loss", output.loss, sync_dist=True) + self.log("train_acc", acc, sync_dist=True) + self.log("train_grad", grad, sync_dist=True) + return output.loss + + def validation_step(self, batch, batch_idx): + output = self.model( + input_ids=batch["input_ids"], + labels=batch["labels"], + ) + pred_ids = self.model.generate( + input_ids=batch["input_ids"], max_new_tokens=self.max_target_length + ) + + acc = self.comput_metrix(output.logits, batch["labels"]) + # print(output.logits.shape) + self.log("val_loss", output.loss, sync_dist=True) + self.log("val_acc", acc, sync_dist=True) + batch_labels = torch.where( + batch["labels"] != -100, batch["labels"], self.tokenizer.pad_token_id + ) + + ppl = torch.exp(output.loss) + self.log("val_ppl", ppl, sync_dist=True) + pred_tokens = self.tokenizer.batch_decode( + pred_ids, cleanup_tokenization_space=True, skip_special_tokens=True + ) + label_tokens = self.tokenizer.batch_decode( + batch_labels, cleanup_tokenization_space=True, skip_special_tokens=True + ) + pred_sentences = list(map(remove_pad, pred_tokens)) + # print(label_tokens) + self.bleu_val.append(compute_bleu(pred_sentences, [[t] for t in label_tokens])) + candidate = [ + chinese_char_tokenize(p).lstrip("") for p in pred_tokens + ] + target = [ + generate_sentence(chinese_char_tokenize(sent)).lstrip("") + for sent in label_tokens + ] + self.rouge_metric.update(preds=candidate, target=target) + f1 = compute_f1(candidate, label_tokens) + self.log("val_f1", f1, sync_dist=True) + + def on_validation_epoch_end(self) -> None: + n = len(self.bleu_val) + avg_bleu = float(sum(self.bleu_val)) / n + print("bleu:", avg_bleu) + self.log("val_bleu", avg_bleu) + self.bleu_val = [] + rouge_dict = self.rouge_metric.compute() + # reset the metric after once validation + self.rouge_metric.reset() + for k, v in rouge_dict.items(): + self.log("val_{}".format(k), v, sync_dist=True) + if self.trainer._accelerator_connector.cluster_environment.global_rank() == 0: + print("rouge:\n", rouge_dict) + return + + def predict_step(self, batch, batch_idx): + num_beams = 1 + do_sample = False + top_p = None + if self.decode_strategy == "beamsearch": + num_beams = 10 + elif self.decode_strategy == "sampling": + num_beams = 4 + top_p = 0.9 + do_sample = True + + prediction_dic = self.model.generate( + input_ids=batch["input_ids"], + max_new_tokens=self.max_target_length, + num_beams=num_beams, + do_sample=do_sample, + top_p=top_p, + no_repeat_ngram_size=3, + return_dict_in_generate=True, + output_scores=True, + ) + output = self.model( + input_ids=batch["input_ids"], + labels=batch["labels"], + ) + prediction_ids = prediction_dic["sequences"] + loss_tensor = self.loss_func(output.logits.transpose(1, 2), batch["labels"]) + indexes = torch.where(batch["labels"] == self.tokenizer.eos_token_id)[1] + loss = torch.sum(loss_tensor, dim=1) / indexes + return 
{ + "input_ids": batch["input_ids"], + "predict_ids": prediction_ids, + "labels": batch["labels"], + "decoder_inputs": batch["decoder_input_ids"], + "loss": loss, + } + + def save_preditions(self, result, args): + with open(args.prediction_res_path, "w", encoding="utf8") as fw: + preditions = [] + labels = [] + for batch in result: + print(batch.keys()) + batch_labels = torch.where( + batch["labels"] != -100, + batch["labels"], + self.tokenizer.pad_token_id, + ) + for i in range(len(batch["input_ids"])): + context = self.tokenizer.decode( + batch["input_ids"][i], + skip_special_tokens=True, + cleanup_tokenization_space=True, + ) + pred = self.tokenizer.decode( + batch["predict_ids"][i], + cleanup_tokenization_space=True, + skip_special_tokens=True, + ) + target = generate_sentence( + self.tokenizer.batch_decode( + batch_labels[i], cleanup_tokenization_space=True + ) + ) + pred = pred.lstrip("") + target = target.lstrip("") + self.rouge_metric.update( + preds=chinese_char_tokenize(pred), + target=chinese_char_tokenize(target), + ) + preditions.append(list(pred)) + labels.append([list(target)]) + fw.write("context:" + "".join(context) + "\n") + fw.write("pred:" + pred + "\n") + fw.write("target" + target + "\n") + fw.write("loss:{:.6f}\n".format(batch["loss"][i].item())) + fw.write("\n") + bleu = compute_bleu(preditions, labels) + fw.write("bleu:{}".format(bleu)) + print("finish prediction, saved in {}".format(args.prediction_res_path)) + return preditions, labels + + def comput_metrix(self, logits, labels): + y_pred = torch.argmax(logits, dim=-1) + y_true = labels.float() + pad_num = torch.sum(torch.eq(labels, -100)) + corr = torch.eq(y_pred, y_true) + acc = (torch.sum(corr.float()) - pad_num) / ( + y_true.view(size=(-1,)).shape[0] - pad_num + ) + return acc + + +class PredictDataModule(UniversalDataModule): + + def predict_dataloader(self): + return self.test_dataloader() + + +def main(): + + total_parser = argparse.ArgumentParser("Finetune Dialogue model.") + total_parser.add_argument("--do_eval_only", action="store_true", default=False) + total_parser.add_argument("--pretrained_model_path", default=None, type=str) + total_parser.add_argument("--new_vocab_path", default=None, type=str) + total_parser.add_argument( + "--tokenizer_type", + default="t5_tokenizer", + choices=["t5_tokenizer", "bert_tokenizer"], + ) + total_parser.add_argument("--train_split_size", default=0.995, type=int) + total_parser.add_argument("--preprocessing_num_workers", default="10", type=int) + total_parser.add_argument("--ckpt_path", default=None, type=str) + total_parser.add_argument("--use_cache", default=False, type=bool) + total_parser.add_argument( + "--formator", default="dialog", choices=["dialog", "ccqa", "t5style"] + ) + + sys.path.append("../../../") + + from fengshen.utils.universal_checkpoint import UniversalCheckpoint + from qa_dataset import T5StyleDataset, TextGenCollator + + total_parser = T5StyleDataset.add_data_specific_args(total_parser) + total_parser = UniversalDataModule.add_data_specific_args( + total_parser + ) # TaskDataModel + total_parser = Trainer.add_argparse_args(total_parser) + total_parser = UniversalCheckpoint.add_argparse_args(total_parser) + total_parser = QAFinetuneModel.add_model_specific_args( + total_parser + ) # todo: check names + + args = total_parser.parse_args() + print("Argument parse success.") + print("superviseT5DataModel load start {}".format(get_time_str())) + + config = MT5Config.from_pretrained(args.pretrained_model_path) + collate_fn = TextGenCollator( + 
config=config, + pad_token_id=config.pad_token_id, + decoder_start_token_id=config.decoder_start_token_id, + formator=args.formator) + if not args.do_eval_only: + datasets = {'train': T5StyleDataset(args.train_file, args, load_data_type=0, data="train"), + 'validation': T5StyleDataset(args.val_file, args, load_data_type=0, data="dev")} + + model = QAFinetuneModel(args) + print("superviseT5DataModel load end {}".format(get_time_str())) + + data_model = UniversalDataModule( + tokenizer=None, args=args, collate_fn=collate_fn, datasets=datasets + ) + print('data loaded') + checkpoint_callback = UniversalCheckpoint(args) + lr_monitor = LearningRateMonitor(logging_interval="step") + logger = loggers.TensorBoardLogger( + save_dir=os.path.join(args.default_root_dir, "logs/") # TOCHANGE + ) + trainer = Trainer.from_argparse_args( + args, logger=logger, callbacks=[checkpoint_callback, lr_monitor] + ) + trainer.fit(model, data_model) + else: + datasets = {'test': T5StyleDataset(args.test_file, args, load_data_type=0, data="test")} + + data_model = PredictDataModule( + tokenizer=None, args=args, collate_fn=collate_fn, datasets=datasets + ) + + tokenizer = T5Tokenizer.from_pretrained(args.pretrained_model_path) + model = QAFinetuneModel(args=args) + trainer = Trainer.from_argparse_args(args) + result = trainer.predict(model, data_model, ckpt_path=args.ckpt_path) + predictions, labels = model.save_preditions(result, args) + sample = result[0] # first_batch + batch_labels = torch.where( + sample["labels"] != -100, sample["labels"], model.tokenizer.pad_token_id + ) + for i in range(4): + print(tokenizer.batch_decode(sample["input_ids"][i])) + print(tokenizer.batch_decode(sample["predict_ids"][i])) + print(tokenizer.batch_decode(batch_labels[i])) + + +def compute_f1(cand, ref): + f1_score = [] + for p, t in zip(cand, ref): + p_tokens = p.split() + t_tokens = t.split() + common = Counter() & Counter(t.split()) + num_same = sum(common.values()) + if len(t_tokens) == 0 or len(p_tokens) == 0: + f1 = int(p == t) + elif num_same == 0: + f1 = 0 + else: + precision = 1.0 * num_same / len(p_tokens) + recall = 1.0 * num_same / len(t_tokens) + f1 = (2 * precision * recall) / (precision + recall + 1e-8) + f1_score.append(f1) + f1 = sum(f1_score) / float(len(cand)) + return f1 + + +def generate_sentence(raw_list): + words = [] + i = 0 + while i < len(raw_list) and raw_list[i] != "": + words.append(raw_list[i]) + i += 1 + return "".join(words) + + +def remove_pad(raw_text, ref=False): + if ref: + return [raw_text.lstrip("")] + else: + return raw_text.lstrip("") + + +def compute_bleu(preditions, labels): + + score_nltk = corpus_bleu(labels, preditions) + return score_nltk + + +def get_gradient_norm(model): + total_norm = 0 + parameters = [ + p for p in model.parameters() if p.grad is not None and p.requires_grad + ] + for p in parameters: + param_norm = p.grad.detach().data.norm(2) + total_norm += param_norm.item() ** 2 + total_norm = total_norm**0.5 + return total_norm + + +def get_time_str(): + return time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) + + +if __name__ == "__main__": + main() diff --git a/fengshen/examples/qa_t5/qa_dataset.py b/fengshen/examples/qa_t5/qa_dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..d1d395fa798f5b3b9c3eaf33cd5dcca1ff67722a --- /dev/null +++ b/fengshen/examples/qa_t5/qa_dataset.py @@ -0,0 +1,187 @@ +# -*- encoding: utf-8 -*- +''' +Copyright 2022 The International Digital Economy Academy (IDEA). CCNL team. All rights reserved. 
+Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +@File : qa_dataset.py +@Time : 2022/10/28 19:57 +@Author : He Junqing +@Version : 1.0 +@Contact : hejunqing@idea.edu.cn +@License : (C)Copyright 2022-2023, CCNL-IDEA +''' +# here put the import lib + +from dataclasses import dataclass +import numpy as np +import torch +from torch.nn.utils.rnn import pad_sequence + +from fengshen.data.t5_dataloader.t5_gen_datasets import DialogDataset + + +class T5StyleDataset(DialogDataset): + + @staticmethod + def add_data_specific_args(parent_args): + parser = parent_args.add_argument_group("Dataset") + parser.add_argument("--max_seq_length", default=512, type=int) + parser.add_argument("--max_knowledge_length", default=128, type=int) + parser.add_argument("--max_target_length", default=128, type=int) + return parent_args + + def regular_tokenize(self, sample): + """ + sample.keys:question:str,context:stc, answer:[],idx:int,ans_span:[] + """ + plain_text = ( + "question:" + + sample["question"] + + "knowledge:" + + sample["context"][: self.max_knowledge_length] + ) + l_text = len(plain_text) + + ctx_len = self.max_seq_length - l_text - 1 + if ctx_len > 0 and "history" in sample: + context = "[SEP]".join(sample["history"]) + plain_text += "context:" + context + + res_prefix = self.tokenizer.encode("answer:", add_special_tokens=False) + # res_prefix.tolist() + l_rp = len(res_prefix) + + tokenized = self.tokenizer.encode( + plain_text, + add_special_tokens=False, + truncation=True, + max_length=self.max_seq_length - 2 - l_rp, + ) + # tokenized.tolist() + tokenized += res_prefix + # add maskid + mask_id = self.tokenizer.convert_tokens_to_ids("") + tokenized.append(mask_id) + tokenized.append(self.eos_token_id) + # print(tokenized) + + target_ids = self.tokenizer.encode( + "" + sample["answer"][0], + add_special_tokens=True, + truncation=True, + max_length=self.max_target_length, + ) + + # print(target_ids) + tokenized_sample = {} + tokenized_sample["input_ids"] = np.array(tokenized, dtype=np.int32) + tokenized_sample["attention_mask"] = np.ones(len(tokenized), dtype=np.int8) + tokenized_sample["labels"] = np.array(target_ids, dtype=np.int32) + tokenized_sample["idx"] = sample["idx"] + # print(tokenized_sample) + return tokenized_sample + + +@dataclass +class TextGenCollator: + ''' + ''' + config: None + pad_token_id: -100 + decoder_start_token_id: 0 + formator: str = 't5style' + + def setup(self): + pass + + def __call__(self, samples): + batch = { + k: [ + torch.tensor(samples[i][k], dtype=torch.int64) + for i in range(len(samples)) + ] + for k in ["input_ids", "attention_mask", "labels"] + } + batch["idx"] = torch.tensor([samples[i]["idx"] for i in range(len(samples))]) + + # print(batch) + for k, v in batch.items(): + if k != "labels" and k != "idx": + batch[k] = pad_sequence( + v, batch_first=True, padding_value=self.pad_token_id + ) + elif k == "labels": + batch[k] = pad_sequence(v, batch_first=True, padding_value=-100) + + batch["decoder_input_ids"] = torch.tensor( + self.shift_tokens_right( + batch["labels"], self.pad_token_id, self.decoder_start_token_id + ), + dtype=torch.long, + ) + return batch + + def shift_tokens_right( + 
self, input_ids: np.array, pad_token_id: int, decoder_start_token_id: int + ) -> np.ndarray: + """ + Shift input ids one token to the right. + """ + shifted_input_ids = np.zeros_like(input_ids) + shifted_input_ids[:, 1:] = input_ids[:, :-1] + shifted_input_ids[:, 0] = decoder_start_token_id + + shifted_input_ids = np.where( + shifted_input_ids == -100, pad_token_id, shifted_input_ids + ) + return shifted_input_ids + + +if __name__ == "__main__": + # test + import argparse + + total_parser = argparse.ArgumentParser("DATASET parser") + total_parser.add_argument( + "--tokenizer_type", + default="t5_tokenizer", + choices=["bert_tokenizer", "t5_tokenizer"], + ) + total_parser.add_argument("--preprocessing_num_workers", default="4", type=int) + total_parser.add_argument( + "--new_vocab_path", + default=None, + type=str, + ) + + total_parser.add_argument( + "--pretrained_model_path", + default="YOUR DOWNLOAD MODEL PATH", + ) + total_parser.add_argument("--train_split_size", default=0.995, type=int) + total_parser.add_argument( + "--formator", default="t5style", choices=["t5style", "squad", "dialog"] + ) + total_parser = TextGenCollator.add_data_specific_args(total_parser) + args = total_parser.parse_args() + args.train_data_path = "cmrc" + ds = T5StyleDataset("cmrc", args, "dev") + print(len(ds)) + for i in range(10): + print(ds[i]) + + dl = TextGenCollator(args) + for i in range(5): + for batch in dl.val_dataloader(): + print(batch) + print(batch["input_ids"]) + print(batch["no_answer"]) + print(batch["decoder_input_ids"]) + print(batch["labels"]) diff --git a/fengshen/examples/qa_t5/run_finetune.sh b/fengshen/examples/qa_t5/run_finetune.sh new file mode 100644 index 0000000000000000000000000000000000000000..4e8e1f4b0fe07a8d2807e44d55a1f22cb2ef6439 --- /dev/null +++ b/fengshen/examples/qa_t5/run_finetune.sh @@ -0,0 +1,109 @@ +#!/bin/bash +#SBATCH --job-name=finetune-cmrc +#SBATCH --nodes=1 +#SBATCH --ntasks-per-node=1 +#SBATCH --gres=gpu:1 # number of gpus +#SBATCH --cpus-per-task=4 # cpu-cores per task (>1 if multi-threaded tasks) +#SBATCH -o $YOUR_PROJECT_DIR/%x-%j.log +#SBATCH -e $YOUR_PROJECT_DIR/%x-%j.err + +set -x -e + +echo "START TIME: $(date)" +MICRO_BATCH_SIZE=8 + +ROOT_DIR=$YOUR_PROJECT_DIR +DOWNLOAD_MODEL_PATH=$YOUR_PROJECT_DIR/Randeng-T5-784M-QA-Chinese/ + + +if [ ! -d ${ROOT_DIR} ];then + mkdir ${ROOT_DIR} + echo ${ROOT_DIR} created!!!!!!!!!!!!!! +else + echo ${ROOT_DIR} exist!!!!!!!!!!!!!!! 
+fi + +ZERO_STAGE=1 + +config_json="$ROOT_DIR/ds_config.randeng_t5_dialog_784M.$SLURM_JOBID.json" +export MASTER_PORT=$[RANDOM%10000+30000] + +cat < $config_json +{ + "train_micro_batch_size_per_gpu": ${MICRO_BATCH_SIZE}, + "steps_per_print": 100, + "gradient_clipping": 1.0, + "zero_optimization": { + "stage": $ZERO_STAGE, + "contiguous_gradients": false, + "overlap_comm": true, + "reduce_scatter": true, + "reduce_bucket_size": 50000000, + "allgather_bucket_size": 500000000 + }, +} +EOT + +export PL_DEEPSPEED_CONFIG_PATH=$config_json +export TORCH_EXTENSIONS_DIR=$YOUR_HOME/tmp/torch_extendsions +# strategy=ddp +strategy=deepspeed_stage_1 + +TRAINER_ARGS=" + --max_epochs 10 \ + --gpus 1 \ + --num_nodes 1 \ + --strategy ${strategy} \ + --default_root_dir $ROOT_DIR \ + --save_ckpt_path $ROOT_DIR/ckpt \ + --save_top_k 5 \ + --every_n_train_steps 100\ + --monitor val_rougeL_fmeasure \ + --mode max \ + --save_last \ + --check_val_every_n_epoch 1 \ + --num_workers 4 \ + --dataloader_workers 4 \ + --replace_sampler_ddp False \ + --accumulate_grad_batches 2 \ + --formator t5style \ + --filename model-{epoch:02d}-{val_loss:.4f}-{val_rougeL_fmeasure:.3f} \ + --precision 16 \ +" + +TRAIN_DATA_PATH=$YOUR_TRAIN_FILE +DEV_DATA_PATH=$YOUR_DEV_FILE + +DATA_ARGS=" + --train_batchsize $MICRO_BATCH_SIZE \ + --val_batchsize $MICRO_BATCH_SIZE \ + --train_file $TRAIN_DATA_PATH \ + --val_file $DEV_DATA_PATH \ + --max_seq_length 512 \ + --max_knowledge_length 425 \ + --max_target_length 128 +" + +MODEL_ARGS=" + --pretrained_model_path $DOWNLOAD_MODEL_PATH \ + --tokenizer_type t5_tokenizer \ + --learning_rate 1e-4 \ + --weight_decay 1e-2 \ + --warmup_ratio 0.1 \ + --sheduler_type polynomial \ + --min_learning_rate 1e-5 \ +" + +SCRIPTS_PATH=$YOUR_PROJECT_DIR/Fengshenbang-LM/fengshen/examples/qa_t5/finetune_t5_cmrc.py + +export CMD=" \ + $SCRIPTS_PATH \ + $TRAINER_ARGS \ + $MODEL_ARGS \ + $DATA_ARGS \ + " + +echo $CMD +# conda activate fs +# export CUDA_VISIBLE_DEVICES=5 +srun python $CMD diff --git a/fengshen/examples/qa_t5/run_predict.sh b/fengshen/examples/qa_t5/run_predict.sh new file mode 100644 index 0000000000000000000000000000000000000000..8b8470ed1136320b75ba6da51209b3c9af9c74d0 --- /dev/null +++ b/fengshen/examples/qa_t5/run_predict.sh @@ -0,0 +1,110 @@ +#!/bin/bash +#SBATCH --job-name=predict-cmrc +#SBATCH --nodes=1 +#SBATCH --ntasks-per-node=1 +#SBATCH --gres=gpu:1 # number of gpus +#SBATCH --cpus-per-task=4 # cpu-cores per task (>1 if multi-threaded tasks) +#SBATCH -o $YOUR_SLURM_LOG_PATH/%x-%j.log +#SBATCH -e $YOUR_SLURM_LOG_PATH/%x-%j.err + +# +set -x -e + +echo "START TIME: $(date)" +MICRO_BATCH_SIZE=8 + +ROOT_DIR=$YOUR_PROJECT_DIR +DOWNLOAD_MODEL_PATH=$YOUR_PROJECT_DIR/Randeng-T5-784M-QA-Chinese/ +#YOUR_MODEL_DIR + +if [ ! -d ${ROOT_DIR} ];then + mkdir ${ROOT_DIR} + echo ${ROOT_DIR} created!!!!!!!!!!!!!! +else + echo ${ROOT_DIR} exist!!!!!!!!!!!!!!! 
+fi + +ZERO_STAGE=1 + +config_json="$ROOT_DIR/ds_config.randeng_t5_dialog_784M.$SLURM_JOBID.json" +export MASTER_PORT=$[RANDOM%10000+30000] + +cat < $config_json +{ + "train_micro_batch_size_per_gpu": ${MICRO_BATCH_SIZE}, + "steps_per_print": 100, + "gradient_clipping": 1.0, + "zero_optimization": { + "stage": $ZERO_STAGE, + "contiguous_gradients": false, + "overlap_comm": true, + "reduce_scatter": true, + "reduce_bucket_size": 50000000, + "allgather_bucket_size": 500000000 + }, +} +EOT + +export PL_DEEPSPEED_CONFIG_PATH=$config_json +export TORCH_EXTENSIONS_DIR=$YOUR_HOME/tmp/torch_extendsions +# strategy=ddp +strategy=deepspeed_stage_1 + +TRAINER_ARGS=" + --max_epochs 10 \ + --gpus 1 \ + --num_nodes 1 \ + --strategy ${strategy} \ + --default_root_dir $ROOT_DIR \ + --save_ckpt_path $ROOT_DIR/ckpt \ + --save_top_k 5 \ + --every_n_train_steps 100\ + --monitor val_rougeL_fmeasure \ + --mode max \ + --save_last \ + --check_val_every_n_epoch 1 \ + --num_workers 4 \ + --dataloader_workers 4 \ + --replace_sampler_ddp False \ + --accumulate_grad_batches 2 \ + --formator t5style \ + --filename model-{epoch:02d}-{val_loss:.4f}-{val_rougeL_fmeasure:.3f} \ + --do_eval_only \ + --prediction_res_path $ROOT_DIR/predictions_sampling.txt \ + --decode_strategy sampling \ + --precision 16 \ +" + +TEST_FILE_PATH=$YOUR_DATA_FILE + +DATA_ARGS=" + --train_batchsize $MICRO_BATCH_SIZE \ + --val_batchsize $MICRO_BATCH_SIZE \ + --test_file $TEST_FILE_PATH \ + --max_seq_length 512 \ + --max_knowledge_length 425 \ + --max_target_length 128 +" +MODEL_ARGS=" + --pretrained_model_path $DOWNLOAD_MODEL_PATH\ + --tokenizer_type t5_tokenizer \ + --learning_rate 1e-4 \ + --weight_decay 1e-2 \ + --warmup_ratio 0.1 \ + --sheduler_type polynomial \ + --min_learning_rate 1e-5 \ +" + +SCRIPTS_PATH=$YOUR_PROJECT_DIR/Fengshenbang-LM/fengshen/examples/qa_t5/finetune_t5_cmrc.py + +export CMD=" \ + $SCRIPTS_PATH \ + $TRAINER_ARGS \ + $MODEL_ARGS \ + $DATA_ARGS \ + " + +echo $CMD +# conda activate fs +# export CUDA_VISIBLE_DEVICES=5 +srun python $CMD diff --git a/fengshen/examples/randeng_reasoning/README.md b/fengshen/examples/randeng_reasoning/README.md new file mode 100644 index 0000000000000000000000000000000000000000..b7ccc3df3d5c3fe50ebd52f1ddc8a822e13e6528 --- /dev/null +++ b/fengshen/examples/randeng_reasoning/README.md @@ -0,0 +1,161 @@ +# 燃灯系列-因果推理生成模型 + +- Huggingface: + - [Randeng-TransformerXL-5B-Deduction-Chinese](https://huggingface.co./IDEA-CCNL/Randeng-TransformerXL-5B-Deduction-Chinese) + - [Randeng-TransformerXL-5B-Abduction-Chinese](https://huggingface.co./IDEA-CCNL/Randeng-TransformerXL-5B-Abduction-Chinese) +- Github: [Fengshenbang-LM](https://github.com/IDEA-CCNL/Fengshenbang-LM/fengshen/examples/randeng_reasoning) +- Docs: [Fengshenbang-Docs](https://fengshenbang-doc.readthedocs.io/) +- Demo: [Reasoning Tree](https://idea.edu.cn/ccnl-act/reasoning/) + +## 简介 Brief Introduction + +基于Transformer-XL的中文因果推理生成模型和反绎推理生成模型。 + +Chinese deductive reasoning model and abductive reasoning model based on Transformer-XL. 
+ +## 模型分类 Model Taxonomy + +| 需求 Demand | 任务 Task | 系列 Series | 模型 Model | 参数 Parameter | 额外 Extra | +| :----: | :----: | :----: | :----: | :----: | :----: | +| 通用 General | 自然语言生成 NLG | 燃灯 Randeng | TransformerXL | 5.0B | 中文-因果推理 Chinese-Reasoning | + +## 模型信息 Model Information + +**数据准备 Corpus Preparation** + +* 悟道语料库(280G版本) +* 因果语料库(2.3M个样本):基于悟道语料库(280G版本),通过关联词匹配、人工标注 + [GTSFactory](https://gtsfactory.com/)筛选、数据清洗等步骤获取的具有因果关系的句子对 + +* Wudao Corpus (with 280G samples) +* Wudao Causal Corpus (with 2.3 million samples): Based on the Wudao corpus (280G version), sentence pairs with causality were obtained through logic indicator matching, manual annotation + [GTSFactory](https://gtsfactory.com/), and data cleaning. + +**训练流程 Model Training** +1. 在悟道语料库(280G版本)上进行预训练 +2. 在1.5M因果语料上分别进行因果生成任务和反绎生成任务的训练 +3. 基于其余0.8M因果语料,[Randeng-TransformerXL-5B-Deduction-Chinese](https://huggingface.co./IDEA-CCNL/Randeng-TransformerXL-5B-Deduction-Chinese)、[Randeng-TransformerXL-5B-Abduction-Chinese](https://huggingface.co./IDEA-CCNL/Randeng-TransformerXL-5B-Abduction-Chinese)和[Erlangshen-Roberta-330M-Causal-Chinese](https://huggingface.co./IDEA-CCNL/Erlangshen-Roberta-330M-Causal-Chinese)进行Self-consistent闭环迭代训练 + * 两个生成模型基于核采样和贪心的方式进行因果推理和反绎推理,产生大量伪样本; + * Erlangshen-Roberta-330M-Causal-Chinese模型对伪样本句子对的因果关系进行打分,筛选供自身以及生成模型训练的样本 + +First, the Transformer-XL model was pre-trained on the Wudao Corpus (with 280G samples) and annotated similar-sentence pair dataset (same as [Randeng-TransformerXL-1.1B-Paraphrasing-Chinese](https://huggingface.co./IDEA-CCNL/Randeng-TransformerXL-1.1B-Paraphrasing-Chinese)). +Then, the model was trained on our causal corpus (about 1.5 million samples) for the deductive reasoning task. +At last, based on the remaining 0.8 million samples of the causal corpus, we conducted self-consistent learning on [Randeng-TransformerXL-5B-Deduction-Chinese](https://huggingface.co./IDEA-CCNL/Randeng-TransformerXL-5B-Deduction-Chinese) and [Randeng-TransformerXL-5B-Abduction-Chinese](https://huggingface.co./IDEA-CCNL/Randeng-TransformerXL-5B-Abduction-Chinese), cooperating with [Erlangshen-Roberta-330M-Causal-Chinese](https://huggingface.co./IDEA-CCNL/Erlangshen-Roberta-330M-Causal-Chinese). +Specifically, two generative models performed deductive reasoning and abductive reasoning based on each sample respectively, generating a large number of pseudo-samples; [Erlangshen-Roberta-330M-Causal-Chinese](https://huggingface.co./IDEA-CCNL/Erlangshen-Roberta-330M-Causal-Chinese) scored the causality of the pseudo-samples and selected the training data for itself and the generative models in the next iteration. 
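+
+To make the self-consistent loop described above more concrete, the sketch below shows one iteration in schematic form. It is not the project's released training code: it assumes `deduction_generate` / `abduction_generate` return a list of generated sentences (as in the usage example further down), and `causal_score` is a hypothetical stand-in for scoring a cause-effect pair with Erlangshen-Roberta-330M-Causal-Chinese; the 0.5 threshold is likewise a placeholder.
+
+```python
+from fengshen.models.transfo_xl_reasoning import deduction_generate, abduction_generate
+
+def self_consistent_round(sentences, deduction_model, abduction_model, tokenizer,
+                          causal_score, threshold=0.5):
+    """One iteration: generate pseudo cause-effect pairs, then keep only the pairs
+    whose causality score clears the threshold; the kept pairs are used to train
+    both the scorer and the two generators in the next iteration."""
+    pseudo_pairs = []
+    for sent in sentences:
+        # deduction: treat the sentence as a cause and sample candidate effects
+        for effect in deduction_generate(deduction_model, tokenizer, sent, device=0):
+            pseudo_pairs.append((sent, effect))
+        # abduction: treat the sentence as an effect and sample candidate causes
+        for cause in abduction_generate(abduction_model, tokenizer, sent, device=0):
+            pseudo_pairs.append((cause, sent))
+    return [pair for pair in pseudo_pairs if causal_score(*pair) > threshold]
+```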
+ +## 加载模型 Loading Models + +```shell +git clone https://github.com/IDEA-CCNL/Fengshenbang-LM.git +cd Fengshenbang-LM +``` + +```python +from fengshen.models.transfo_xl_reasoning import TransfoXLModel +from transformers import T5Tokenizer as TransfoXLTokenizer +deduction_model = TransfoXLModel.from_pretrained('IDEA-CCNL/Randeng-TransformerXL-5B-Deduction-Chinese') +abduction_model = TransfoXLModel.from_pretrained('IDEA-CCNL/Randeng-TransformerXL-5B-Abduction-Chinese') +tokenizer = TransfoXLTokenizer.from_pretrained( + "IDEA-CCNL/Randeng-TransformerXL-5B-Deduction-Chinese", + eos_token='<|endoftext|>', + pad_token='<|endoftext|>', + extra_ids=0 +) +tokenizer.add_special_tokens({'bos_token': ''}) +``` + +## 使用示例 Usage Example + +```python +from fengshen.models.transfo_xl_reasoning import deduction_generate, abduction_generate +input_text = "机器人统治世界" +input_texts = ["机器人统治世界", "玉米价格持续上涨"] +print(deduction_generate(deduction_model, tokenizer, input_text, device=0)) +print(deduction_generate(deduction_model, tokenizer, input_texts, device=0)) +print(abduction_generate(abduction_model, tokenizer, input_text, device=0)) +print(abduction_generate(abduction_model, tokenizer, input_texts, device=0)) +``` + +## 一些例子 Some Cases + +(以下内容为模型生成,不代表本单位观点) + +**因果推理 Deduction** +1. 鲸鱼是哺乳动物,哺乳动物是恒温动物 + - 鲸鱼拥有极强的恒温能力 + - 鲸鱼需要消耗大量的能量 + - 鲸鱼能在海洋中生存下来,完全依靠其体温恒定 +2. 鲸鱼是哺乳动物,哺乳动物不是恒温动物 + - 鲸鱼不是恒温动物,鲸鱼是变温动物 + - 鲸鱼会随温度变化而变化 + - 鲸鱼有冬眠现象 +3. 机器人统治世界 + - 机器人取代人类做大部分工作 + - 机器人世界的法律必须能够适用于机器人统治的现实世界 + - 机器人必须能够相互沟通,并能够处理人类无法处理的复杂情况 + - 未来将不再需要大量工人,机器人将接管工厂 +4. 玉米价格持续上涨 + - 玉米淀粉价格也呈现上涨趋势 + - 玉米种植效益不断攀升 + - 在玉米深加工行业引起了一阵骚动 +5. 实体经济融资难、融资贵 + - 急需发展互联网金融等金融业态,为实体经济提供融资服务 + - 融资需求向金融资产转移,增加了金融资产供给 + - 必须大力发展资本市场,使资本市场成为经济转型的助推器 +6. 影响华北地区的冷空气势力偏弱 + - 冷空气的影响时间将偏短 + - 冷空气影响结束后,华北地区气温会继续缓慢回升 + - 华北地区气温较常年同期偏高 + +**反绎推理 Abduction** +1. 玉米价格持续上涨 + - 玉米库存较低,需求增加 + - 东北地区受降雨天气影响,玉米生长受到影响 + - 今年玉米种植面积大幅度下降 +2. 玉米价格下跌 + - 玉米的库存量大,需求量低 + - 今年玉米产量创新高,而需求不足 + - 目前玉米市场处于供大于求的状态,再加上近期华北地区遭遇了强降雨天气,玉米质量下降 +3. 农作物大量死亡 + - 旱灾持续时间长,又无雨,土壤干裂,作物得不到水分 + - 霜冻来临,气温骤降,植物受冻 + - 许多农民为了使农作物能够长得更好,使用更多的农药,并且没有合理的休耕措施 +4. 鲸鱼需要消耗大量的能量 + - 鲸鱼的体型庞大,新陈代谢速度又快 + - 鲸鱼的身体结构特殊,需要消耗大量的能量来维持身体结构的稳定 +5. 实体经济融资难、融资贵 + - 融资渠道单一,实体经济难以获得充足的资金 + - 实体经济融资主要依赖抵押、担保、信贷等间接融资方式,存在抵押物不足、担保机制不完善等问题 + - 实体经济往往需要大量的资金,而银行受制于风险控制、资本充足率等要求,很难大量发放贷款 +6. 
火山爆发导致植物死亡 + - 火山灰会阻碍植物吸收阳光 + - 火山灰的飘散,导致植物无法吸收到足够的氧气 + - 火山喷发时,岩浆温度极高,植物无法承受 + + +## 引用 Citation + +如果您在您的工作中使用了我们的模型,可以引用我们的[论文](https://arxiv.org/abs/2209.02970): + +If you are using the resource for your work, please cite the our [paper](https://arxiv.org/abs/2209.02970): + +```text +@article{fengshenbang, + author = {Junjie Wang and Yuxiang Zhang and Lin Zhang and Ping Yang and Xinyu Gao and Ziwei Wu and Xiaoqun Dong and Junqing He and Jianheng Zhuo and Qi Yang and Yongfeng Huang and Xiayu Li and Yanghan Wu and Junyu Lu and Xinyu Zhu and Weifeng Chen and Ting Han and Kunhao Pan and Rui Wang and Hao Wang and Xiaojun Wu and Zhongshen Zeng and Chongpei Chen and Ruyi Gan and Jiaxing Zhang}, + title = {Fengshenbang 1.0: Being the Foundation of Chinese Cognitive Intelligence}, + journal = {CoRR}, + volume = {abs/2209.02970}, + year = {2022} +} +``` + +也可以引用我们的[网站](https://github.com/IDEA-CCNL/Fengshenbang-LM/): + +You can also cite our [website](https://github.com/IDEA-CCNL/Fengshenbang-LM/): + +```text +@misc{Fengshenbang-LM, + title={Fengshenbang-LM}, + author={IDEA-CCNL}, + year={2021}, + howpublished={\url{https://github.com/IDEA-CCNL/Fengshenbang-LM}}, +} +``` \ No newline at end of file diff --git a/fengshen/examples/sequence_tagging/finetune_sequence_tagging.py b/fengshen/examples/sequence_tagging/finetune_sequence_tagging.py new file mode 100644 index 0000000000000000000000000000000000000000..a4ca513231810e3c7020e1ee4657c53ce286a5e7 --- /dev/null +++ b/fengshen/examples/sequence_tagging/finetune_sequence_tagging.py @@ -0,0 +1,317 @@ +# coding=utf-8 +# Copyright 2021 The IDEA Authors. All rights reserved. + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at + +# http://www.apache.org/licenses/LICENSE-2.0 + +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +from dataclasses import dataclass +import copy +import logging +import torch.nn.functional as F +import os +import json +import torch +import pytorch_lightning as pl +import argparse +from pytorch_lightning.callbacks import ModelCheckpoint, LearningRateMonitor +from torch.utils.data import Dataset, DataLoader +from torch.utils.data._utils.collate import default_collate +from fengshen.models.tagging_models.bert_for_tagging import BertLinear,BertCrf,BertSpan,BertBiaffine +from fengshen.data.sequence_tagging_dataloader.sequence_tagging_collator import CollatorForLinear, CollatorForCrf, CollatorForSpan, CollatorForBiaffine +from fengshen.data.sequence_tagging_dataloader.sequence_tagging_datasets import DataProcessor, get_datasets +from fengshen.metric.metric import EntityScore +from fengshen.models.model_utils import configure_optimizers, get_total_steps +from fengshen.utils.universal_checkpoint import UniversalCheckpoint +from fengshen.data.universal_datamodule import UniversalDataModule + +from transformers import ( + BertTokenizer, BertConfig, AutoTokenizer +) +from fengshen.metric.utils_ner import get_entities, bert_extract_item + + +_model_dict={ + 'bert-linear': BertLinear, + 'bert-crf': BertCrf, + 'bert-span': BertSpan, + 'bert-biaffine': BertBiaffine +} + +_collator_dict={ + 'linear': CollatorForLinear, + 'crf': CollatorForCrf, + 'span': CollatorForSpan +} + +_validation_dict={ + 'linear': 'validation_linear', + 'crf': 'validation_crf', + 'span': 'validation_span', + 'biaffine': 'validation_biaffine', +} + +_prediction_dict={ + 'linear': 'predict_linear', + 'crf': 'predict_crf', + 'span': 'predict_span', + 'biaffine': 'predict_biaffine', +} + +logger = logging.getLogger(__name__) + + +class LitModel(pl.LightningModule): + + @staticmethod + def add_model_specific_args(parent_args): + parser = parent_args.add_argument_group('BaseModel') + parser.add_argument("--max_seq_length", default=512, type=int) + parser.add_argument('--data_dir', default=None, type=str) + parser.add_argument('--model_type', default='bert', type=str) + parser.add_argument("--decode_type", default="linear", choices=["linear", "crf", "biaffine", "span"], type=str) + parser.add_argument('--loss_type', default='ce', type=str, choices=['lsr', 'focal', 'ce']) + return parent_args + + def __init__(self, args, id2label, tokenizer): + super().__init__() + + self.model_name=args.model_type+"-"+args.decode_type + self.id2label = id2label + + self.config=BertConfig.from_pretrained(args.model_path) + self.tokenizer = tokenizer + self.model = _model_dict[self.model_name].from_pretrained(args.model_path, config=self.config, num_labels=len(self.id2label), loss_type=args.loss_type) + self.entity_score=EntityScore() + + self.validate_fn=getattr(self,_validation_dict[args.decode_type]) + self.predict_fn=getattr(self,_prediction_dict[args.decode_type]) + + self.predict_result=[] + + self.save_hyperparameters(args) + + def setup(self, stage) -> None: + if stage == 'fit': + self.total_steps = get_total_steps(self.trainer, self.hparams) + print('Total steps: {}' .format(self.total_steps)) + + def training_step(self, batch, batch_idx): + outputs = self.model(**batch) + loss = outputs.loss + self.log('train_loss', loss) + return loss + + def validation_step(self, batch, batch_idx): + self.validate_fn(batch,batch_idx) + + def validation_linear(self, batch, batch_idx): + outputs = self.model(**batch) + loss = outputs.loss + logits = outputs.logits + + preds = torch.argmax(F.log_softmax(logits, dim=2), dim=2) + preds = 
preds.detach().cpu().numpy() + labels = batch['labels'].detach().cpu().numpy() + + for i, label in enumerate(labels): + y_true = [] + y_pred = [] + for j, m in enumerate(label): + if j == 0: + continue + elif j == (torch.sum(batch['attention_mask'][i]).item()-1): + true_subject=get_entities(y_true,self.id2label) + pred_subject=get_entities(y_pred,self.id2label) + self.entity_score.update(true_subject=true_subject, pred_subject=pred_subject) + break + else: + y_true.append(self.id2label[labels[i][j]]) + y_pred.append(self.id2label[preds[i][j]]) + + self.log('val_loss', loss) + + def validation_crf(self, batch, batch_idx): + outputs = self.model(**batch) + loss = outputs.loss + logits = outputs.logits + + preds = self.model.crf.decode(logits, batch['attention_mask']) + preds = preds.detach().squeeze(0).cpu().numpy().tolist() + labels = batch['labels'].detach().cpu().numpy() + + for i, label in enumerate(labels): + y_true = [] + y_pred = [] + for j, m in enumerate(label): + if j == 0: + continue + elif j == (torch.sum(batch['attention_mask'][i]).item()-1): + true_subject=get_entities(y_true,self.id2label) + pred_subject=get_entities(y_pred,self.id2label) + self.entity_score.update(true_subject=true_subject, pred_subject=pred_subject) + break + else: + y_true.append(self.id2label[labels[i][j]]) + y_pred.append(self.id2label[preds[i][j]]) + + self.log('val_loss', loss) + + def validation_span(self, batch, batch_idx): + outputs = self.model(**batch) + loss = outputs.loss + start_logits = outputs.start_logits + end_logits = outputs.end_logits + labels=batch['subjects'] + for i, T in enumerate(labels): + active_start_logits=start_logits[i][:batch['input_len'][i]] + active_end_logits=end_logits[i][:batch['input_len'][i]] + R = bert_extract_item(active_start_logits, active_end_logits) + + T=T[~torch.all(T==-1,dim=-1)].cpu().numpy() + T=list(map(lambda x:(self.id2label[x[0]],x[1],x[2]),T)) + R=list(map(lambda x:(self.id2label[x[0]],x[1],x[2]),R)) + + self.entity_score.update(true_subject=T, pred_subject=R) + self.log('val_loss', loss) + + def validation_biaffine(self, batch, batch_idx): + outputs = self.model(**batch) + loss = outputs.loss + logits = outputs.span_logits + + preds = torch.argmax(logits.cpu().numpy(), axis=-1) + labels = batch['span_labels'].cpu().numpy() + + for i, label in enumerate(labels): + input_len=(batch['input_len'][i])-2 + active_label=labels[i,1:input_len+1,1:input_len+1] + active_pred=preds[i,1:input_len+1,1:input_len+1] + + temp_1 = [] + temp_2 = [] + + for j in range(input_len): + for k in range(input_len): + if self.id2label[active_label[j,k]]!="O": + temp_1.append([self.id2label[active_label[j,k]],j,k]) + if self.id2label[active_pred[j,k]]!="O": + temp_2.append([self.id2label[active_pred[j,k]],j,k]) + + self.entity_score.update(pred_subject=temp_2, true_subject=temp_1) + + self.log('val_loss', loss) + + def validation_epoch_end(self, outputs): + # compute metric for all process + score_dict, _ = self.entity_score.result() + if self.trainer._accelerator_connector.cluster_environment.global_rank() == 0: + print('score_dict:\n', score_dict) + # reset the metric after once validation + self.entity_score.reset() + for k, v in score_dict.items(): + self.log('val_{}'.format(k), v) + + def predict_step(self, batch, batch_idx): + batch['labels'] = None + outputs = self.model(**batch) + + self.predict_fn(batch,batch_idx) + + def predict_linear(self, batch, outputs): + logits = torch.argmax(F.log_softmax(outputs.logits, dim=2), dim=2) + preds = logits.detach().cpu().numpy() + + 
for i, pred in enumerate(preds): + text = self.tokenizer.convert_ids_to_tokens(batch['input_ids'][i])[:batch['input_len'][i]][1:-1] + pred = pred[:batch['input_len'][i]][1:-1] + label_entities = get_entities(pred, self.id2label) + for label_list in label_entities: + label_list.append("".join(text[label_list[1]:label_list[2]+1])) + + self.predict_result.extend(label_entities) + + def predict_crf(self, batch, batch_idx): + logits = self.model(**batch).logits + preds = self.model.crf.decode(logits, batch['attention_mask']).squeeze(0).cpu().numpy().tolist() + + for i, pred in enumerate(preds): + text = self.tokenizer.convert_ids_to_tokens(batch['input_ids'][i])[:batch['input_len'][i]][1:-1] + pred = pred[:batch['input_len'][i]][1:-1] + label_entities = get_entities(pred, self.id2label) + for label_list in label_entities: + label_list.append("".join(text[label_list[1]:label_list[2]+1])) + + self.predict_result.extend(label_entities) + + def predict_span(self, batch, batch_idx): + batch['start_positions'] = None + batch['end_positions'] = None + outputs = self.model(**batch) + + start_logits, end_logits = outputs.start_logits, outputs.end_logits + for i, _ in enumerate(start_logits): + text = self.tokenizer.convert_ids_to_tokens(batch['input_ids'][i])[:batch['input_len'][i]][1:-1] + R = bert_extract_item(start_logits[i][:batch['input_len'][i]], end_logits[i][:batch['input_len'][i]]) + if R: + label_entities = [[self.id2label[x[0]],x[1],x[2],"".join(text[x[1]:x[2]+1])] for x in R] + else: + label_entities = [] + + self.predict_result.extend(label_entities) + + + + def configure_optimizers(self): + return configure_optimizers(self) + +def main(): + total_parser = argparse.ArgumentParser("TASK NAME") + + # * Args for data preprocessing + total_parser = UniversalDataModule.add_data_specific_args(total_parser) + # * Args for training + total_parser = pl.Trainer.add_argparse_args(total_parser) + total_parser = UniversalCheckpoint.add_argparse_args(total_parser) + + # * Args for base model + from fengshen.models.model_utils import add_module_args + total_parser = add_module_args(total_parser) + total_parser = LitModel.add_model_specific_args(total_parser) + + args = total_parser.parse_args() + + datasets=get_datasets(args) + + checkpoint_callback = UniversalCheckpoint(args).callbacks + lr_monitor = LearningRateMonitor(logging_interval='step') + + trainer = pl.Trainer.from_argparse_args(args, + callbacks=[checkpoint_callback, lr_monitor] + ) + + label2id,id2label=DataProcessor.get_labels(args) + tokenizer = AutoTokenizer.from_pretrained(args.model_path) + + collator = _collator_dict[args.decode_type]() + collator.args=args + collator.tokenizer=tokenizer + collator.label2id=label2id + data_model = UniversalDataModule(tokenizer,collator,args,datasets) + + model = LitModel(args,id2label,tokenizer) + print(label2id) + trainer.fit(model, data_model) + # trainer.predict(model,dataloaders=data_model.predict_dataloader()) + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/fengshen/examples/sequence_tagging/finetune_sequence_tagging.sh b/fengshen/examples/sequence_tagging/finetune_sequence_tagging.sh new file mode 100644 index 0000000000000000000000000000000000000000..a477ed89852a4ec96139e85d7e44ed476aaeab76 --- /dev/null +++ b/fengshen/examples/sequence_tagging/finetune_sequence_tagging.sh @@ -0,0 +1,83 @@ +#!/bin/bash +#SBATCH --job-name=zen2_base_cmeee # create a short name for your job +#SBATCH --nodes=1 # node count +#SBATCH --ntasks-per-node=1 # total number of tasks across all 
nodes +#SBATCH --cpus-per-task=30 # cpu-cores per task (>1 if multi-threaded tasks) +#SBATCH --gres=gpu:1 # number of gpus per node +#SBATCH --mail-type=ALL # send email when job begins, ends or failed etc. +#SBATCH -o /cognitive_comp/lujunyu/experiments/ner_finetune/zen2_base_cmeee/%x-%j.log # output and error file name (%x=job name, %j=job id) +#SBATCH -p hgx + + +ROOT_DIR=../../workspace +export TORCH_EXTENSIONS_DIR=${ROOT_DIR}/torch_extendsions + +MODEL_NAME=ner_bert_base +TASK=cmeee + +MODEL_NAME=bert-base +MODEL_ROOT_DIR=$ROOT_DIR/${MODEL_NAME} +if [ ! -d ${MODEL_ROOT_DIR} ];then + mkdir ${MODEL_ROOT_DIR} +fi + +NNODES=1 +GPUS_PER_NODE=1 + +MICRO_BATCH_SIZE=16 + +ZERO_STAGE=1 +STRATEGY=deepspeed_stage_${ZERO_STAGE} + +DATA_ARGS="\ + --num_workers 8 \ + --dataloader_workers 8 \ + --train_batchsize $MICRO_BATCH_SIZE \ + --val_batchsize $MICRO_BATCH_SIZE \ + --test_batchsize $MICRO_BATCH_SIZE \ + " + +MODEL_ARGS="\ + --model_path $MODEL_ROOT_DIR/pretrain \ + --data_dir /cognitive_comp/lujunyu/data_zh/NER_Aligned/weibo \ + --model_type bert \ + --decode_type linear \ + --learning_rate 5e-5 \ + --weight_decay 0.05 \ + --warmup_ratio 0.1 \ + " + +MODEL_CHECKPOINT_ARGS="\ + --save_top_k -1 \ + --save_last \ + --every_n_train_steps 100 \ + --save_ckpt_path ${MODEL_ROOT_DIR} \ + " + +TRAINER_ARGS="\ + --max_epochs 30 \ + --gpus $GPUS_PER_NODE \ + --num_nodes $NNODES \ + --strategy deepspeed_stage_${ZERO_STAGE} \ + --check_val_every_n_epoch 1 \ + --default_root_dir ${MODEL_ROOT_DIR} \ + --replace_sampler_ddp False \ + " + + +export options=" \ + $DATA_ARGS \ + $MODEL_ARGS \ + $MODEL_CHECKPOINT_ARGS \ + $TRAINER_ARGS \ +" + +python3 finetune_sequence_tagging.py $options + + +# SINGULARITY_PATH=/cognitive_comp/ganruyi/pytorch21_06_py3_docker_image_v2.sif +# python3 $SCRIPT_PATH $options +# source activate base +# singularity exec --nv -B /cognitive_comp/:/cognitive_comp/ $SINGULARITY_PATH /home/ganruyi/anaconda3/bin/python $SCRIPT_PATH $options +# /home/ganruyi/anaconda3/bin/python $SCRIPT_PATH $options + diff --git a/fengshen/examples/stable_diffusion_chinese/README.md b/fengshen/examples/stable_diffusion_chinese/README.md new file mode 100644 index 0000000000000000000000000000000000000000..edfb1d354bb2f98c1d24841ae86a70f07ceee37a --- /dev/null +++ b/fengshen/examples/stable_diffusion_chinese/README.md @@ -0,0 +1,99 @@ +# Taiyi-Stable-Diffusion-1B-Chinese-v0.1 + +- Github: [Fengshenbang-LM](https://github.com/IDEA-CCNL/Fengshenbang-LM) +- Docs: [Fengshenbang-Docs](https://fengshenbang-doc.readthedocs.io/) + +## 简介 Brief Introduction + +首个开源的中文Stable Diffusion模型,基于0.2亿筛选过的中文图文对训练。 + +The first open source Chinese Stable diffusion, which was trained on 20M filtered Chinese image-text pairs. 
+
+## 模型分类 Model Taxonomy
+
+| 需求 Demand | 任务 Task | 系列 Series | 模型 Model | 参数 Parameter | 额外 Extra |
+| :----: | :----: | :----: | :----: | :----: | :----: |
+| 特殊 Special | 多模态 Multimodal | 太乙 Taiyi | Stable Diffusion | 1B | Chinese |
+
+## 模型信息 Model Information
+
+我们将[Noah-Wukong](https://wukong-dataset.github.io/wukong-dataset/)数据集(100M)和[Zero](https://zero.so.com/)数据集(23M)用作预训练的数据集,先用[IDEA-CCNL/Taiyi-CLIP-RoBERTa-102M-ViT-L-Chinese](https://huggingface.co./IDEA-CCNL/Taiyi-CLIP-RoBERTa-102M-ViT-L-Chinese)对这两个数据集的图文对相似性进行打分,取CLIP Score大于0.2的图文对作为我们的训练集。 我们使用[IDEA-CCNL/Taiyi-CLIP-RoBERTa-102M-ViT-L-Chinese](https://huggingface.co./IDEA-CCNL/Taiyi-CLIP-RoBERTa-102M-ViT-L-Chinese)作为初始化的text encoder,冻住[stable-diffusion-v1-4](https://huggingface.co./CompVis/stable-diffusion-v1-4)([论文](https://arxiv.org/abs/2112.10752))模型的其他部分,只训练text encoder,以便保留原始模型的生成能力且实现中文概念的对齐。该模型目前在0.2亿图文对上训练了一个epoch。 我们在 32 x A100 训练了大约100小时。该版本只是一个初步的版本,我们将持续优化并开源后续模型,欢迎交流。
+
+We use [Noah-Wukong](https://wukong-dataset.github.io/wukong-dataset/) (100M) and [Zero](https://zero.so.com/) (23M) as our datasets, and take the image-text pairs with a CLIP Score (based on [IDEA-CCNL/Taiyi-CLIP-RoBERTa-102M-ViT-L-Chinese](https://huggingface.co./IDEA-CCNL/Taiyi-CLIP-RoBERTa-102M-ViT-L-Chinese)) greater than 0.2 as our training set. We use [IDEA-CCNL/Taiyi-CLIP-RoBERTa-102M-ViT-L-Chinese](https://huggingface.co./IDEA-CCNL/Taiyi-CLIP-RoBERTa-102M-ViT-L-Chinese) to initialize the text encoder. To keep the powerful generative capability of Stable Diffusion and align Chinese concepts with the images, we only train the text encoder and freeze the other parts of the [stable-diffusion-v1-4](https://huggingface.co./CompVis/stable-diffusion-v1-4) ([paper](https://arxiv.org/abs/2112.10752)) model. So far the model has been trained for one epoch on the 20M image-text pairs, which took about 100 hours on 32 x A100 GPUs. This is a preliminary version; we will keep optimizing the model and open-source follow-up versions. We welcome any feedback and discussion!
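+
+As a rough illustration of the CLIP-score filtering described above (not the actual data-processing code of this project), the pairs can be scored with the Chinese Taiyi-CLIP text tower and the original ViT-L/14 image tower. The loading pattern below follows the Taiyi-CLIP model card (text features are taken from the `logits` of a `BertForSequenceClassification` head) and may differ in detail; `candidate_pairs` is a hypothetical placeholder for your own (image, caption) list.
+
+```python
+import torch
+from transformers import BertTokenizer, BertForSequenceClassification, CLIPModel, CLIPProcessor
+
+# Chinese text tower (Taiyi-CLIP) + original OpenAI ViT-L/14 image tower
+text_tokenizer = BertTokenizer.from_pretrained("IDEA-CCNL/Taiyi-CLIP-RoBERTa-102M-ViT-L-Chinese")
+text_encoder = BertForSequenceClassification.from_pretrained("IDEA-CCNL/Taiyi-CLIP-RoBERTa-102M-ViT-L-Chinese").eval()
+clip_model = CLIPModel.from_pretrained("openai/clip-vit-large-patch14").eval()
+processor = CLIPProcessor.from_pretrained("openai/clip-vit-large-patch14")
+
+@torch.no_grad()
+def clip_score(image, caption: str) -> float:
+    # cosine similarity between the normalized text and image embeddings
+    text_inputs = text_tokenizer(caption, return_tensors="pt", padding=True)
+    text_features = text_encoder(**text_inputs).logits
+    image_inputs = processor(images=image, return_tensors="pt")
+    image_features = clip_model.get_image_features(**image_inputs)
+    text_features = text_features / text_features.norm(dim=-1, keepdim=True)
+    image_features = image_features / image_features.norm(dim=-1, keepdim=True)
+    return (text_features @ image_features.T).item()
+
+candidate_pairs = []  # hypothetical list of (PIL image, Chinese caption) pairs to be filtered
+# keep only the pairs that clear the 0.2 threshold used for the training set
+filtered = [(img, txt) for img, txt in candidate_pairs if clip_score(img, txt) > 0.2]
+```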
+ +### Result +Basic Prompt + +| 铁马冰河入梦来,3D绘画。 | 飞流直下三千尺,油画。 | 女孩背影,日落,唯美插画。 | +| ---- | ---- | ---- | +| ![](result_examples/tiema.png) | ![](result_examples/feiliu.png) | ![](result_examples/nvhai.jpg) | + +Advanced Prompt + +| 铁马冰河入梦来,概念画,科幻,玄幻,3D | 中国海边城市,科幻,未来感,唯美,插画。 | 那人却在灯火阑珊处,色彩艳丽,古风,资深插画师作品,桌面高清壁纸。 | +| ---- | ---- | ---- | +| ![](result_examples/tiema2.jpg) | ![](result_examples/chengshi.jpg) | ![](result_examples/naren.jpg) | + + +## 使用 Usage + +### 全精度 Full precision + +```py +from diffusers import StableDiffusionPipeline + +pipe = StableDiffusionPipeline.from_pretrained("IDEA-CCNL/Taiyi-Stable-Diffusion-1B-Chinese-v0.1").to("cuda") + +prompt = '飞流直下三千尺,油画' +image = pipe(prompt, guidance_scale=7.5).images[0] +image.save("飞流.png") +``` + +### 半精度 Half precision FP16 (CUDA) + +添加 `torch_dtype=torch.float16` 和 `device_map="auto"` 可以快速加载 FP16 的权重,以加快推理速度。 +更多信息见 [the optimization docs](https://huggingface.co./docs/diffusers/main/en/optimization/fp16#half-precision-weights)。 + +```py +# !pip install git+https://github.com/huggingface/accelerate +import torch +from diffusers import StableDiffusionPipeline +torch.backends.cudnn.benchmark = True +pipe = StableDiffusionPipeline.from_pretrained("IDEA-CCNL/Taiyi-Stable-Diffusion-1B-Chinese-v0.1", torch_dtype=torch.float16) +pipe.to('cuda') + +prompt = '飞流直下三千尺,油画' +image = pipe(prompt, guidance_scale=7.5).images[0] +image.save("飞流.png") +``` + + +## 引用 Citation + +如果您在您的工作中使用了我们的模型,可以引用我们的[总论文](https://arxiv.org/abs/2209.02970): + +If you are using the resource for your work, please cite the our [paper](https://arxiv.org/abs/2209.02970): + +```text +@article{fengshenbang, + author = {Junjie Wang and Yuxiang Zhang and Lin Zhang and Ping Yang and Xinyu Gao and Ziwei Wu and Xiaoqun Dong and Junqing He and Jianheng Zhuo and Qi Yang and Yongfeng Huang and Xiayu Li and Yanghan Wu and Junyu Lu and Xinyu Zhu and Weifeng Chen and Ting Han and Kunhao Pan and Rui Wang and Hao Wang and Xiaojun Wu and Zhongshen Zeng and Chongpei Chen and Ruyi Gan and Jiaxing Zhang}, + title = {Fengshenbang 1.0: Being the Foundation of Chinese Cognitive Intelligence}, + journal = {CoRR}, + volume = {abs/2209.02970}, + year = {2022} +} +``` + +也可以引用我们的[网站](https://github.com/IDEA-CCNL/Fengshenbang-LM/): + +You can also cite our [website](https://github.com/IDEA-CCNL/Fengshenbang-LM/): + +```text +@misc{Fengshenbang-LM, + title={Fengshenbang-LM}, + author={IDEA-CCNL}, + year={2021}, + howpublished={\url{https://github.com/IDEA-CCNL/Fengshenbang-LM}}, +} +``` + diff --git a/fengshen/examples/stable_diffusion_chinese/img/hf_stable_blog.png b/fengshen/examples/stable_diffusion_chinese/img/hf_stable_blog.png new file mode 100644 index 0000000000000000000000000000000000000000..1d8e3e97a5a2c8d324a92d5e0e26efea324c46de Binary files /dev/null and b/fengshen/examples/stable_diffusion_chinese/img/hf_stable_blog.png differ diff --git a/fengshen/examples/stable_diffusion_chinese/img/seed.png b/fengshen/examples/stable_diffusion_chinese/img/seed.png new file mode 100644 index 0000000000000000000000000000000000000000..8d82a8128a65a626f4c48867f6e99540e4970d0d Binary files /dev/null and b/fengshen/examples/stable_diffusion_chinese/img/seed.png differ diff --git a/fengshen/examples/stable_diffusion_chinese/img/test.md b/fengshen/examples/stable_diffusion_chinese/img/test.md new file mode 100644 index 0000000000000000000000000000000000000000..c8b1b42336f7e2c26898b5b99441d324f2de5412 --- /dev/null +++ b/fengshen/examples/stable_diffusion_chinese/img/test.md @@ -0,0 +1 @@ +delete diff 
--git a/fengshen/examples/stable_diffusion_chinese/img/ui.png b/fengshen/examples/stable_diffusion_chinese/img/ui.png new file mode 100644 index 0000000000000000000000000000000000000000..8c34d14cb0fc9379f67f865abd3f3ec33c46bc1b Binary files /dev/null and b/fengshen/examples/stable_diffusion_chinese/img/ui.png differ diff --git "a/fengshen/examples/stable_diffusion_chinese/img/\346\227\245\345\207\272\357\274\214\346\265\267\351\235\242\344\270\212nega\345\271\277\345\221\212.png" "b/fengshen/examples/stable_diffusion_chinese/img/\346\227\245\345\207\272\357\274\214\346\265\267\351\235\242\344\270\212nega\345\271\277\345\221\212.png" new file mode 100644 index 0000000000000000000000000000000000000000..996adacb1736d516defaa177a71c4545cf738df1 --- /dev/null +++ "b/fengshen/examples/stable_diffusion_chinese/img/\346\227\245\345\207\272\357\274\214\346\265\267\351\235\242\344\270\212nega\345\271\277\345\221\212.png" @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ef1d12d9fae3580549b54aebdb8454fb12e5e2dd7f7c61540a0149d40d071998 +size 2581243 diff --git "a/fengshen/examples/stable_diffusion_chinese/img/\346\227\245\345\207\272\357\274\214\346\265\267\351\235\242\344\270\212nega\345\271\277\345\221\212\347\254\246\345\217\267.png" "b/fengshen/examples/stable_diffusion_chinese/img/\346\227\245\345\207\272\357\274\214\346\265\267\351\235\242\344\270\212nega\345\271\277\345\221\212\347\254\246\345\217\267.png" new file mode 100644 index 0000000000000000000000000000000000000000..65a5ca7d0e4bc626064476c687378ba2ca23ffce --- /dev/null +++ "b/fengshen/examples/stable_diffusion_chinese/img/\346\227\245\345\207\272\357\274\214\346\265\267\351\235\242\344\270\212nega\345\271\277\345\221\212\347\254\246\345\217\267.png" @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5226485d0ff3dd1b5d0a7c0b63e7b907aa221b364e84b137ce493ea35ea3d18b +size 2515750 diff --git "a/fengshen/examples/stable_diffusion_chinese/img/\346\227\245\345\207\272\357\274\214\346\265\267\351\235\242\344\270\212nega\345\271\277\345\221\212\347\254\246\345\217\267\350\257\215\346\261\207.png" "b/fengshen/examples/stable_diffusion_chinese/img/\346\227\245\345\207\272\357\274\214\346\265\267\351\235\242\344\270\212nega\345\271\277\345\221\212\347\254\246\345\217\267\350\257\215\346\261\207.png" new file mode 100644 index 0000000000000000000000000000000000000000..066a6106a60ed369891714ef2b68cd1f8f1d9e41 --- /dev/null +++ "b/fengshen/examples/stable_diffusion_chinese/img/\346\227\245\345\207\272\357\274\214\346\265\267\351\235\242\344\270\212nega\345\271\277\345\221\212\347\254\246\345\217\267\350\257\215\346\261\207.png" @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4b8580dd7b305d7c15ae1337da6a88d9f8a05486a822aa13858eff7076d37e72 +size 2549987 diff --git "a/fengshen/examples/stable_diffusion_chinese/img/\346\227\245\345\207\272\357\274\214\346\265\267\351\235\242\344\270\212\344\270\255\346\226\207\345\217\245\345\217\267.png" "b/fengshen/examples/stable_diffusion_chinese/img/\346\227\245\345\207\272\357\274\214\346\265\267\351\235\242\344\270\212\344\270\255\346\226\207\345\217\245\345\217\267.png" new file mode 100644 index 0000000000000000000000000000000000000000..3256b5982218860cd70bae3a5f851752ecebc20c --- /dev/null +++ "b/fengshen/examples/stable_diffusion_chinese/img/\346\227\245\345\207\272\357\274\214\346\265\267\351\235\242\344\270\212\344\270\255\346\226\207\345\217\245\345\217\267.png" @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:3312a017440c8a9f2175d9bdc66a55888ebd75ef79709b664e12e290bc86edf4 +size 2494122 diff --git "a/fengshen/examples/stable_diffusion_chinese/img/\346\227\245\345\207\272\357\274\214\346\265\267\351\235\242\344\270\212\344\270\255\346\226\207\346\204\237\345\217\271\345\217\267.png" "b/fengshen/examples/stable_diffusion_chinese/img/\346\227\245\345\207\272\357\274\214\346\265\267\351\235\242\344\270\212\344\270\255\346\226\207\346\204\237\345\217\271\345\217\267.png" new file mode 100644 index 0000000000000000000000000000000000000000..e0b5607e01d9734fc08f639bdae4ae2c95d8c2fc --- /dev/null +++ "b/fengshen/examples/stable_diffusion_chinese/img/\346\227\245\345\207\272\357\274\214\346\265\267\351\235\242\344\270\212\344\270\255\346\226\207\346\204\237\345\217\271\345\217\267.png" @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:80edee518f20651a7e21196cbd5cb65422c18e60586782f147a0a8f6305cc2a3 +size 2508439 diff --git "a/fengshen/examples/stable_diffusion_chinese/img/\346\227\245\345\207\272\357\274\214\346\265\267\351\235\242\344\270\212\344\270\255\346\226\207\351\200\227\345\217\267.png" "b/fengshen/examples/stable_diffusion_chinese/img/\346\227\245\345\207\272\357\274\214\346\265\267\351\235\242\344\270\212\344\270\255\346\226\207\351\200\227\345\217\267.png" new file mode 100644 index 0000000000000000000000000000000000000000..d0255d44e71ec407dd3e3044631f6c7b53de3744 --- /dev/null +++ "b/fengshen/examples/stable_diffusion_chinese/img/\346\227\245\345\207\272\357\274\214\346\265\267\351\235\242\344\270\212\344\270\255\346\226\207\351\200\227\345\217\267.png" @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:911a5aeeeab3cf06387b6ada29afd40eea56a468dfda6fc71ce75273339dd84b +size 2480902 diff --git "a/fengshen/examples/stable_diffusion_chinese/img/\346\227\245\345\207\272\357\274\214\346\265\267\351\235\242\344\270\212\344\270\255\346\226\207\351\200\227\345\217\267\346\240\207\350\256\260.png" "b/fengshen/examples/stable_diffusion_chinese/img/\346\227\245\345\207\272\357\274\214\346\265\267\351\235\242\344\270\212\344\270\255\346\226\207\351\200\227\345\217\267\346\240\207\350\256\260.png" new file mode 100644 index 0000000000000000000000000000000000000000..720f33bd662470181aaa8fa90adefe0af84dfafd Binary files /dev/null and "b/fengshen/examples/stable_diffusion_chinese/img/\346\227\245\345\207\272\357\274\214\346\265\267\351\235\242\344\270\212\344\270\255\346\226\207\351\200\227\345\217\267\346\240\207\350\256\260.png" differ diff --git "a/fengshen/examples/stable_diffusion_chinese/img/\346\227\245\345\207\272\357\274\214\346\265\267\351\235\242\344\270\212\350\213\261\346\226\207\351\200\227\345\217\267.png" "b/fengshen/examples/stable_diffusion_chinese/img/\346\227\245\345\207\272\357\274\214\346\265\267\351\235\242\344\270\212\350\213\261\346\226\207\351\200\227\345\217\267.png" new file mode 100644 index 0000000000000000000000000000000000000000..75f465dcf29526f60fb60036c9299f6636bf9be0 --- /dev/null +++ "b/fengshen/examples/stable_diffusion_chinese/img/\346\227\245\345\207\272\357\274\214\346\265\267\351\235\242\344\270\212\350\213\261\346\226\207\351\200\227\345\217\267.png" @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bbf28c2b24f38ceec6957b524d120f6e4ba517db73f8bab46d85cdd26bbcdce8 +size 2609499 diff --git "a/fengshen/examples/stable_diffusion_chinese/img/\346\227\245\345\207\272\357\274\214\346\265\267\351\235\242\344\270\212\350\213\261\346\226\207\351\200\227\345\217\2674k\345\243\201\347\272\270.png" 
"b/fengshen/examples/stable_diffusion_chinese/img/\346\227\245\345\207\272\357\274\214\346\265\267\351\235\242\344\270\212\350\213\261\346\226\207\351\200\227\345\217\2674k\345\243\201\347\272\270.png" new file mode 100644 index 0000000000000000000000000000000000000000..823973acc84f432e07965e4bde318e2d25322923 --- /dev/null +++ "b/fengshen/examples/stable_diffusion_chinese/img/\346\227\245\345\207\272\357\274\214\346\265\267\351\235\242\344\270\212\350\213\261\346\226\207\351\200\227\345\217\2674k\345\243\201\347\272\270.png" @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:820341bdc8061c15bc1418f2e576d4b1b19212398d3a5f554dc759f23564f575 +size 2700124 diff --git "a/fengshen/examples/stable_diffusion_chinese/img/\346\227\245\345\207\272\357\274\214\346\265\267\351\235\242\344\270\212\350\213\261\346\226\207\351\200\227\345\217\2674k\345\243\201\347\272\270256.png" "b/fengshen/examples/stable_diffusion_chinese/img/\346\227\245\345\207\272\357\274\214\346\265\267\351\235\242\344\270\212\350\213\261\346\226\207\351\200\227\345\217\2674k\345\243\201\347\272\270256.png" new file mode 100644 index 0000000000000000000000000000000000000000..24e554d0d9c642867fab2c2884f37a17ce0bcb96 Binary files /dev/null and "b/fengshen/examples/stable_diffusion_chinese/img/\346\227\245\345\207\272\357\274\214\346\265\267\351\235\242\344\270\212\350\213\261\346\226\207\351\200\227\345\217\2674k\345\243\201\347\272\270256.png" differ diff --git "a/fengshen/examples/stable_diffusion_chinese/img/\346\227\245\345\207\272\357\274\214\346\265\267\351\235\242\344\270\212\350\213\261\346\226\207\351\200\227\345\217\2674k\345\243\201\347\272\270384.png" "b/fengshen/examples/stable_diffusion_chinese/img/\346\227\245\345\207\272\357\274\214\346\265\267\351\235\242\344\270\212\350\213\261\346\226\207\351\200\227\345\217\2674k\345\243\201\347\272\270384.png" new file mode 100644 index 0000000000000000000000000000000000000000..2e6f2c6269b0b4ce79c57b8c9bcb4b54685230db --- /dev/null +++ "b/fengshen/examples/stable_diffusion_chinese/img/\346\227\245\345\207\272\357\274\214\346\265\267\351\235\242\344\270\212\350\213\261\346\226\207\351\200\227\345\217\2674k\345\243\201\347\272\270384.png" @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:03b6ea049e5788fee0145d06cfb4056fed061a68a956a37361cb19ceed434a23 +size 1540886 diff --git "a/fengshen/examples/stable_diffusion_chinese/img/\346\227\245\345\207\272\357\274\214\346\265\267\351\235\242\344\270\212\350\213\261\346\226\207\351\200\227\345\217\2674k\345\243\201\347\272\270\345\244\215\346\235\202.png" "b/fengshen/examples/stable_diffusion_chinese/img/\346\227\245\345\207\272\357\274\214\346\265\267\351\235\242\344\270\212\350\213\261\346\226\207\351\200\227\345\217\2674k\345\243\201\347\272\270\345\244\215\346\235\202.png" new file mode 100644 index 0000000000000000000000000000000000000000..ecea935526ade3aa7ffdbdda31fe8b2ee022f3bf --- /dev/null +++ "b/fengshen/examples/stable_diffusion_chinese/img/\346\227\245\345\207\272\357\274\214\346\265\267\351\235\242\344\270\212\350\213\261\346\226\207\351\200\227\345\217\2674k\345\243\201\347\272\270\345\244\215\346\235\202.png" @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:479ef4a81477c11f282ce791fd569d28d4b8c1016fae768e30334e85cbb2e7c0 +size 2809505 diff --git 
"a/fengshen/examples/stable_diffusion_chinese/img/\346\227\245\345\207\272\357\274\214\346\265\267\351\235\242\344\270\212\350\213\261\346\226\207\351\200\227\345\217\2674k\345\243\201\347\272\270\347\262\276\347\273\206.png" "b/fengshen/examples/stable_diffusion_chinese/img/\346\227\245\345\207\272\357\274\214\346\265\267\351\235\242\344\270\212\350\213\261\346\226\207\351\200\227\345\217\2674k\345\243\201\347\272\270\347\262\276\347\273\206.png" new file mode 100644 index 0000000000000000000000000000000000000000..bab90e44e01e5b235f060acb3bb178245bea57ad --- /dev/null +++ "b/fengshen/examples/stable_diffusion_chinese/img/\346\227\245\345\207\272\357\274\214\346\265\267\351\235\242\344\270\212\350\213\261\346\226\207\351\200\227\345\217\2674k\345\243\201\347\272\270\347\262\276\347\273\206.png" @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e68da5d3b8cafd3d8339a8047eb2540d3a6827b8779877254cc1eabee8ffda4a +size 2791607 diff --git "a/fengshen/examples/stable_diffusion_chinese/img/\346\227\245\345\207\272\357\274\214\346\265\267\351\235\242\344\270\212\350\213\261\346\226\207\351\200\227\345\217\2674k\345\243\201\347\272\270\351\253\230\346\270\205.png" "b/fengshen/examples/stable_diffusion_chinese/img/\346\227\245\345\207\272\357\274\214\346\265\267\351\235\242\344\270\212\350\213\261\346\226\207\351\200\227\345\217\2674k\345\243\201\347\272\270\351\253\230\346\270\205.png" new file mode 100644 index 0000000000000000000000000000000000000000..9a746e8967b9e3bdc8d484d8b4bef1a2a6d30810 --- /dev/null +++ "b/fengshen/examples/stable_diffusion_chinese/img/\346\227\245\345\207\272\357\274\214\346\265\267\351\235\242\344\270\212\350\213\261\346\226\207\351\200\227\345\217\2674k\345\243\201\347\272\270\351\253\230\346\270\205.png" @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4dba3a4e18aacc50694454fe3475b3e407d94c35312b98cc3e84091ddae20464 +size 2746087 diff --git "a/fengshen/examples/stable_diffusion_chinese/img/\346\227\245\345\207\272\357\274\214\346\265\267\351\235\242\344\270\212\350\213\261\346\226\207\351\200\227\345\217\267\346\217\222\347\224\273.png" "b/fengshen/examples/stable_diffusion_chinese/img/\346\227\245\345\207\272\357\274\214\346\265\267\351\235\242\344\270\212\350\213\261\346\226\207\351\200\227\345\217\267\346\217\222\347\224\273.png" new file mode 100644 index 0000000000000000000000000000000000000000..f129b44b34dd5f0be9b3622d7ab495add79d90d9 --- /dev/null +++ "b/fengshen/examples/stable_diffusion_chinese/img/\346\227\245\345\207\272\357\274\214\346\265\267\351\235\242\344\270\212\350\213\261\346\226\207\351\200\227\345\217\267\346\217\222\347\224\273.png" @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8dbf987b31c1b264dcebb1605357ad2bd3ee41ff953ab7a4f155554284254751 +size 2796841 diff --git "a/fengshen/examples/stable_diffusion_chinese/img/\346\227\245\345\207\272\357\274\214\346\265\267\351\235\242\344\270\212\350\213\261\346\226\207\351\200\227\345\217\267\346\260\264\345\275\251.png" "b/fengshen/examples/stable_diffusion_chinese/img/\346\227\245\345\207\272\357\274\214\346\265\267\351\235\242\344\270\212\350\213\261\346\226\207\351\200\227\345\217\267\346\260\264\345\275\251.png" new file mode 100644 index 0000000000000000000000000000000000000000..481e4bf5d6db76427026d741b12d616c1b6c2c74 --- /dev/null +++ "b/fengshen/examples/stable_diffusion_chinese/img/\346\227\245\345\207\272\357\274\214\346\265\267\351\235\242\344\270\212\350\213\261\346\226\207\351\200\227\345\217\267\346\260\264\345\275\251.png" 
@@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:49fd318fd749f205b87742743a19d3efb2999fb6c6cc95ebdabf971cb0be7b1d +size 2810533 diff --git "a/fengshen/examples/stable_diffusion_chinese/img/\346\227\245\345\207\272\357\274\214\346\265\267\351\235\242\344\270\212\350\213\261\346\226\207\351\200\227\345\217\267\346\262\271\347\224\273.png" "b/fengshen/examples/stable_diffusion_chinese/img/\346\227\245\345\207\272\357\274\214\346\265\267\351\235\242\344\270\212\350\213\261\346\226\207\351\200\227\345\217\267\346\262\271\347\224\273.png" new file mode 100644 index 0000000000000000000000000000000000000000..59f0c85aecfa7d389b4905ace2adb88f49c29b1b --- /dev/null +++ "b/fengshen/examples/stable_diffusion_chinese/img/\346\227\245\345\207\272\357\274\214\346\265\267\351\235\242\344\270\212\350\213\261\346\226\207\351\200\227\345\217\267\346\262\271\347\224\273.png" @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b9425ab60933c4e237a8c86f1754fbe5de7e989638b2bce509db5dd87bfc4aad +size 2941964 diff --git "a/fengshen/examples/stable_diffusion_chinese/img/\346\227\245\345\207\272\357\274\214\346\265\267\351\235\242\344\270\212\350\213\261\346\226\207\351\200\227\345\217\267\347\264\240\346\217\217.png" "b/fengshen/examples/stable_diffusion_chinese/img/\346\227\245\345\207\272\357\274\214\346\265\267\351\235\242\344\270\212\350\213\261\346\226\207\351\200\227\345\217\267\347\264\240\346\217\217.png" new file mode 100644 index 0000000000000000000000000000000000000000..0abc8554d99d18fe4aeb7e907cc8215994fb666b --- /dev/null +++ "b/fengshen/examples/stable_diffusion_chinese/img/\346\227\245\345\207\272\357\274\214\346\265\267\351\235\242\344\270\212\350\213\261\346\226\207\351\200\227\345\217\267\347\264\240\346\217\217.png" @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e507647da9b443096e33d65dbd389e0e8e94f2b96bbbadd84b997f22e53e608c +size 2451943 diff --git a/fengshen/examples/stable_diffusion_chinese/result_examples/chengshi.jpg b/fengshen/examples/stable_diffusion_chinese/result_examples/chengshi.jpg new file mode 100644 index 0000000000000000000000000000000000000000..6c164a7715d3dad11fd992a65967a6be9ec129e9 Binary files /dev/null and b/fengshen/examples/stable_diffusion_chinese/result_examples/chengshi.jpg differ diff --git a/fengshen/examples/stable_diffusion_chinese/result_examples/feiliu.png b/fengshen/examples/stable_diffusion_chinese/result_examples/feiliu.png new file mode 100644 index 0000000000000000000000000000000000000000..eef1ec05852d686ef4476ad70f78a456d814cbca Binary files /dev/null and b/fengshen/examples/stable_diffusion_chinese/result_examples/feiliu.png differ diff --git a/fengshen/examples/stable_diffusion_chinese/result_examples/naren.jpg b/fengshen/examples/stable_diffusion_chinese/result_examples/naren.jpg new file mode 100644 index 0000000000000000000000000000000000000000..646e0e9b3c464669483a4f19d6f3c438b658979c Binary files /dev/null and b/fengshen/examples/stable_diffusion_chinese/result_examples/naren.jpg differ diff --git a/fengshen/examples/stable_diffusion_chinese/result_examples/nvhai.jpg b/fengshen/examples/stable_diffusion_chinese/result_examples/nvhai.jpg new file mode 100644 index 0000000000000000000000000000000000000000..cd032bc8ceab786bd5f9f51a58cc07e0ab4ce64d Binary files /dev/null and b/fengshen/examples/stable_diffusion_chinese/result_examples/nvhai.jpg differ diff --git a/fengshen/examples/stable_diffusion_chinese/result_examples/tiema.png 
b/fengshen/examples/stable_diffusion_chinese/result_examples/tiema.png new file mode 100644 index 0000000000000000000000000000000000000000..b7c806c456c4cb08039da2d7ceffba776090e498 Binary files /dev/null and b/fengshen/examples/stable_diffusion_chinese/result_examples/tiema.png differ diff --git a/fengshen/examples/stable_diffusion_chinese/result_examples/tiema2.jpg b/fengshen/examples/stable_diffusion_chinese/result_examples/tiema2.jpg new file mode 100644 index 0000000000000000000000000000000000000000..de568bdb7ac108499d18a12c82aba98e51ddca70 Binary files /dev/null and b/fengshen/examples/stable_diffusion_chinese/result_examples/tiema2.jpg differ diff --git a/fengshen/examples/stable_diffusion_chinese/taiyi_handbook.md b/fengshen/examples/stable_diffusion_chinese/taiyi_handbook.md new file mode 100644 index 0000000000000000000000000000000000000000..2849521e6ec23b8b116974bb601cdc400b1a216a --- /dev/null +++ b/fengshen/examples/stable_diffusion_chinese/taiyi_handbook.md @@ -0,0 +1,425 @@ +# 太乙绘画使用手册1.0——AI人类助理入职指南 + +版本:2022.11.20 (Ver 1) + +编撰团队:IDEA CCNL 封神榜团队 +团队主页:https://github.com/IDEA-CCNL/Fengshenbang-LM + +腾讯文档版本:太乙绘画使用手册1.0 https://docs.qq.com/doc/DWklwWkVvSFVwUE9Q + +感谢所有参与编撰以及投稿的“助理们”!(微信搜索:fengshenbang-lm) + +**特别感谢名单(排名按投稿时间顺序):** +王军杰,甘如饴,陈伟峰,李夏禹,高昕宇, + +
+ +# 目录 +- [太乙绘画使用手册1.0——AI人类助理入职指南](#太乙绘画使用手册10ai人类助理入职指南) +- [目录](#目录) +- [前言](#前言) +- [入门手册(如何写一个优秀的提示词)](#入门手册如何写一个优秀的提示词) + - [懒人简洁版](#懒人简洁版) + - [一些基础准备](#一些基础准备) + - [一个逗号引发的水印](#一个逗号引发的水印) + - [反向prompt negative](#反向prompt-negative) + - [赋予某种属性(4k壁纸, 插画, 油画等)消除白边](#赋予某种属性4k壁纸-插画-油画等消除白边) + - [增加细节](#增加细节) + - [画幅(512×512)](#画幅512512) +- [引用](#引用) +- [联系我们](#联系我们) +- [版权许可](#版权许可) + +
+ +# 前言 + +本手册追求仅使用**自然语言**就可以生成**好看的**图片。 + +这是一本**免费的、开源的**手册,我们乐意于**接受每个人的投稿**,一同完善本手册。 + +本手册旨在提供一些关于中文文生图模型(太乙系列)的一些神奇的文本提示词,并且分享我们的一些神奇的发现(规则)。 + +本手册包括两大部分: +- 入门手册:提示词基础写法以及原理 +- 效果图册:一些我们觉得好看的图和对应的prompt + +本使用手册使用环境为: +- 模型 +https://huggingface.co./IDEA-CCNL/Taiyi-Stable-Diffusion-1B-Chinese-v0.1 + +- 环境 +WebUI +相关Github: https://github.com/IDEA-CCNL/Fengshenbang-LM/issues/186 + +参考:https://docs.qq.com/doc/DWHl3am5Zb05QbGVs + +
+ +# 入门手册(如何写一个优秀的提示词) + +![avatar](img/ui.png) + +
+ +## 懒人简洁版 +___ +
+ +提示词 Prompt: +> 不能出现中文的标点符号,比如中文的逗号,中文句号。并且需要赋予这幅画某种属性。 +> +> 如:长河落日圆, 4k壁纸 +> +
+ +反向提示词 Negative prompt: +> 一些负面词汇 +> +> 通用反向提示词:广告, ,, !, 。, ;, 资讯, 新闻, 水印 + +
+画幅大小设置为512×512最佳。 + + +
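+
+If you would rather call the model from Python than through the WebUI, the cheat sheet above translates roughly into the following diffusers call. This is only a sketch: the prompt, negative prompt and 512×512 size come from this section, while the scheduler, step count and output filename are ordinary assumptions rather than an exact reproduction of the WebUI settings.
+
+```python
+import torch
+from diffusers import StableDiffusionPipeline
+
+pipe = StableDiffusionPipeline.from_pretrained(
+    "IDEA-CCNL/Taiyi-Stable-Diffusion-1B-Chinese-v0.1", torch_dtype=torch.float16
+).to("cuda")
+
+image = pipe(
+    prompt="长河落日圆, 4k壁纸",                          # natural-language prompt plus an attribute word
+    negative_prompt="广告, ,, !, 。, ;, 资讯, 新闻, 水印",  # the generic negative prompt above
+    height=512,
+    width=512,
+    guidance_scale=7.5,
+    num_inference_steps=20,
+).images[0]
+image.save("changhe_luori.png")
+```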
+ +## 一些基础准备 +___ +
+ +以下实验的随机种子均为:1419200315 + +![avatar](img/ui.png) + +
+ +## 一个逗号引发的水印 +___ +
+ +我们来看看什么都不改会是咋样的。 + +日出,海面上 +Steps: 20, Sampler: PLMS, CFG scale: 7, Seed: 1419200315, Size: 512x512, Model hash: e2e75020, Batch size: 6, Batch pos: 0 + +![avatar](img/日出,海面上中文逗号.png) + +
+ +可以看到,其实是会出现水印,以及画幅不满的问题的。 + +![avatar](img/日出,海面上中文逗号标记.png) + +
+ +那我们把中文逗号换成英文逗号呢? + +日出, 海面上 +Steps: 20, Sampler: PLMS, CFG scale: 7, Seed: 1419200315, Size: 512x512, Model hash: e2e75020, Batch size: 6, Batch pos: 0 + +![avatar](img/日出,海面上英文逗号.png) + +
+ +!!!神奇的事情出现了,水印消失了! + +
+
+会不会是标点符号的问题?所以我在上述英文逗号的基础上,添加一个中文的句号作为结尾。
+
+![avatar](img/日出,海面上中文句号.png)
+
+没错,神奇的事情出现了,水印回来了,而且位置一模一样。
+
+ +我甚至可以弄出更多的水印,比如加中文的感叹号。 + +日出, 海面上! +Steps: 20, Sampler: PLMS, CFG scale: 7, Seed: 1419200315, Size: 512x512, Model hash: e2e75020, Batch size: 6, Batch pos: 0 + +![avatar](img/日出,海面上中文感叹号.png) + +所以,一个重要的结论为,中文的标点符号是和水印有着某种强相关的联系的! + +因此,我们输入提示词时,应该**不用任何中文标点符号**。 + +
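+
+Since the takeaway of this section is simply to avoid Chinese punctuation in prompts, a tiny helper like the one below (a convenience added here, not part of the handbook's WebUI workflow) can normalize a prompt before generation:
+
+```python
+# Map common Chinese punctuation to English equivalents before sending a prompt to the model.
+CN_PUNCT = str.maketrans({",": ", ", "。": ". ", "!": "! ", ";": "; ", ":": ": "})
+
+def normalize_prompt(prompt: str) -> str:
+    return prompt.translate(CN_PUNCT).strip()
+
+print(normalize_prompt("日出,海面上!"))  # -> "日出, 海面上!"
+```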
+ +## 反向prompt negative +___ +
+ +基本上就是把一些不好的词全加进去。 + +我们的原图为: + +日出, 海面上 +Steps: 20, Sampler: PLMS, CFG scale: 7, Seed: 1419200315, Size: 512x512, Model hash: e2e75020, Batch size: 6, Batch pos: 0 + +![avatar](img/日出,海面上英文逗号.png) + +
+ +日出, 海面上 +Negative prompt: 广告 +Steps: 20, Sampler: PLMS, CFG scale: 7, Seed: 1419200315, Size: 512x512, Model hash: e2e75020, Batch size: 6, Batch pos: 0 + +![avatar](img/日出,海面上nega广告.png) + +
+
+加上了广告之后,画面的表现力要好一些,比如图5的山的轮廓更好了。
+
+根据之前的一些经验,我们把中文标点也都加进反向提示词里:
+
+ +日出, 海面上 +Negative prompt: 广告, ,, !, 。, ; +Steps: 20, Sampler: PLMS, CFG scale: 7, Seed: 1419200315, Size: 512x512, Model hash: e2e75020, Batch size: 6, Batch pos: 0 + +![avatar](img/日出,海面上nega广告符号.png) + +
+ +细节更多了点 + +
+ +日出, 海面上 +Negative prompt: 广告, ,, !, 。, ;, 资讯, 新闻, 水印 +Steps: 20, Sampler: PLMS, CFG scale: 7, Seed: 1419200315, Size: 512x512, Model hash: e2e75020, Batch size: 6, Batch pos: 0 + +![avatar](img/日出,海面上nega广告符号词汇.png) + +
+ +所以,我们的反向提示词选择: **广告, ,, !, 。, ;, 资讯, 新闻, 水印** + +
+ +## 赋予某种属性(4k壁纸, 插画, 油画等)消除白边 +___ +
+ +我们的原图为: + +
+ +日出, 海面上 +Steps: 20, Sampler: PLMS, CFG scale: 7, Seed: 1419200315, Size: 512x512, Model hash: e2e75020, Batch size: 6, Batch pos: 0 + +![avatar](img/日出,海面上英文逗号.png) + +
+ +我们添加了某种属性,比如 4k壁纸 之后: + +**4k壁纸** + +日出, 海面上, 4k壁纸 +Negative prompt: 广告, ,, !, 。, ;, 资讯, 新闻, 水印 +Steps: 20, Sampler: PLMS, CFG scale: 7, Seed: 1419200315, Size: 512x512, Model hash: e2e75020, Batch size: 6, Batch pos: 0 + +![avatar](img/日出,海面上英文逗号4k壁纸.png) + +
+ +**interesting!图3的白边不见了!** + +
+
+一个可能的解释是,我们的训练数据中,用的是resize的方法来调整输入的图片,而这样做,对于边长小于512的图,会自动保留白边。而这也就导致了我们的生成会有白边。但是一旦给这幅画赋予了某种属性,就可以避免这件事了。
+
+
+(注,我试过3k壁纸和8k壁纸,都不行,估计是语料里真的没有。我试过 壁纸,这个prompt看起来不高清。)
+
+ +试试看别的属性 + +
+ +**插画** + +日出, 海面上, 插画 +Negative prompt: 广告, ,, !, 。, ;, 资讯, 新闻, 水印 +Steps: 20, Sampler: PLMS, CFG scale: 7, Seed: 1419200315, Size: 512x512, Model hash: e2e75020, Batch size: 6, Batch pos: 0 + +![avatar](img/日出,海面上英文逗号插画.png) + +
+
+插画,其实什么画风都可能出现,但总体来说都是画。
+
+ +**油画** + +日出, 海面上, 油画 +Negative prompt: 广告, ,, !, 。, ;, 资讯, 新闻, 水印 +Steps: 20, Sampler: PLMS, CFG scale: 7, Seed: 1419200315, Size: 512x512, Model hash: e2e75020, Batch size: 6, Batch pos: 0 + +![avatar](img/日出,海面上英文逗号油画.png) + +
+
+虽然图3出现了画框,但一幅油画带有画框也是正常的。
+
+ +**水彩** + +日出, 海面上, 水彩 +Negative prompt: 广告, ,, !, 。, ;, 资讯, 新闻, 水印 +Steps: 20, Sampler: PLMS, CFG scale: 7, Seed: 1419200315, Size: 512x512, Model hash: e2e75020, Batch size: 6, Batch pos: 0 + +![avatar](img/日出,海面上英文逗号水彩.png) + +
+ +**素描** + +日出, 海面上, 素描 +Negative prompt: 广告, ,, !, 。, ;, 资讯, 新闻, 水印 +Steps: 20, Sampler: PLMS, CFG scale: 7, Seed: 1419200315, Size: 512x512, Model hash: e2e75020, Batch size: 6, Batch pos: 0 + +![avatar](img/日出,海面上英文逗号素描.png) + + +
+ +## 增加细节 +___ +
+ +ok,我们回退一下。 + +
+ +日出, 海面上, 4k壁纸 +Negative prompt: 广告, ,, !, 。, ;, 资讯, 新闻, 水印 +Steps: 20, Sampler: PLMS, CFG scale: 7, Seed: 1419200315, Size: 512x512, Model hash: e2e75020, Batch size: 6, Batch pos: 0 + +![avatar](img/日出,海面上英文逗号4k壁纸.png) + +
+
+如果我们希望有更多的细节呢?
+
+ +**复杂** + +日出, 海面上, 4k壁纸, 复杂 +Negative prompt: 广告, ,, !, 。, ;, 资讯, 新闻, 水印 +Steps: 20, Sampler: PLMS, CFG scale: 7, Seed: 1419200315, Size: 512x512, Model hash: e2e75020, Batch size: 6, Batch pos: 0 + +![avatar](img/日出,海面上英文逗号4k壁纸复杂.png) + +
+
+可以看到,复杂是有一定作用的,所有图的细节都增加了。
+
+ +**精细** + +日出, 海面上, 4k壁纸, 精细 +Negative prompt: 广告, ,, !, 。, ;, 资讯, 新闻, 水印 +Steps: 20, Sampler: PLMS, CFG scale: 7, Seed: 1419200315, Size: 512x512, Model hash: e2e75020, Batch size: 6, Batch pos: 0 + +![avatar](img/日出,海面上英文逗号4k壁纸精细.png) + +
+
+精细 的做法反而是把不少细节都做了平滑处理,过渡更加柔和。
+
+ +**高清** + +日出, 海面上, 4k壁纸, 高清 +Negative prompt: 广告, ,, !, 。, ;, 资讯, 新闻, 水印 +Steps: 20, Sampler: PLMS, CFG scale: 7, Seed: 1419200315, Size: 512x512, Model hash: e2e75020, Batch size: 6, Batch pos: 0 + +![avatar](img/日出,海面上英文逗号4k壁纸高清.png) + +
+ +只多了一点点细节,图2的海面上多了光斑,这么一说也许是光影效果好了一些。 + + +
+ +## 画幅(512×512) +___ +
+ +不同的画幅也会影响生成的内容和质量。 + +参考自:https://huggingface.co./blog/stable_diffusion + +![avatar](img/hf_stable_blog.png) + +
+ +在stable diffusion中也有这个相关的发现,512*512是最好的画幅。 + +
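+
+下文会对比 512、384、256 三种画幅的实际效果;如果想自己复现,可以用 `height`/`width` 参数直接控制画幅,示意写法如下(沿用前文示例中的模型与参数,仅供参考):
+
+```python
+# 仅作示意:同一提示词、同一种子,在不同画幅下各生成一张图做对比
+import torch
+from diffusers import StableDiffusionPipeline
+
+pipe = StableDiffusionPipeline.from_pretrained(
+    "IDEA-CCNL/Taiyi-Stable-Diffusion-1B-Chinese-v0.1"
+).to("cuda")
+
+for size in (512, 384, 256):
+    generator = torch.Generator(device="cuda").manual_seed(1419200315)
+    image = pipe(
+        "日出, 海面上, 4k壁纸",
+        negative_prompt="广告, ,, !, 。, ;, 资讯, 新闻, 水印",
+        height=size, width=size,
+        num_inference_steps=20,
+        guidance_scale=7,
+        generator=generator,
+    ).images[0]
+    image.save(f"日出_海面上_4k壁纸_{size}.png")
+```
+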
+ +我们看看正常的: + +
+ +**512*512** + +日出, 海面上, 4k壁纸 +Negative prompt: 广告, ,, !, 。, ;, 资讯, 新闻, 水印 +Steps: 20, Sampler: PLMS, CFG scale: 7, Seed: 1419200315, Size: 512x512, Model hash: e2e75020, Batch size: 6, Batch pos: 0 + +![avatar](img/日出,海面上英文逗号4k壁纸.png) + +
+ +**384*384** + +日出, 海面上, 4k壁纸 +Negative prompt: 广告, ,, !, 。, ;, 资讯, 新闻, 水印 +Steps: 20, Sampler: PLMS, CFG scale: 7, Seed: 1419200315, Size: 384x384, Model hash: e2e75020, Batch size: 6, Batch pos: 0 + +![avatar](img/日出,海面上英文逗号4k壁纸384.png) + +
+ +低画幅会导致画面莫名撕裂,出图非常毛躁。 + +
+ +**256*256** + +如果我们进一步降低画质,会非常非常撕裂: + +日出, 海面上, 4k壁纸 +Negative prompt: 广告, ,, !, 。, ;, 资讯, 新闻, 水印 +Steps: 20, Sampler: PLMS, CFG scale: 7, Seed: 1419200315, Size: 256x256, Model hash: e2e75020, Batch size: 6, Batch pos: 0 + +![avatar](img/日出,海面上英文逗号4k壁纸256.png) + +# 引用 + +``` +@misc{Fengshenbang-LM, + title={Fengshenbang-LM}, + author={IDEA-CCNL}, + year={2021}, + howpublished={\url{https://github.com/IDEA-CCNL/Fengshenbang-LM}}, +} +``` + +# 版权许可 + +[Apache License 2.0](LICENSE) diff --git a/fengshen/examples/stable_diffusion_chinese_EN/README.md b/fengshen/examples/stable_diffusion_chinese_EN/README.md new file mode 100644 index 0000000000000000000000000000000000000000..8bd939d901203225ea6902d688769390c7c10cd8 --- /dev/null +++ b/fengshen/examples/stable_diffusion_chinese_EN/README.md @@ -0,0 +1,110 @@ +# Taiyi-Stable-Diffusion-1B-Chinese-EN-v0.1 + +- Github: [Fengshenbang-LM](https://github.com/IDEA-CCNL/Fengshenbang-LM) +- Docs: [Fengshenbang-Docs](https://fengshenbang-doc.readthedocs.io/) + +## 简介 Brief Introduction + +首个开源的中英双语Stable Diffusion模型,基于0.2亿筛选过的中文图文对训练。 + +The first open source Chinese&English Bilingual Stable diffusion, which was trained on 20M filtered Chinese image-text pairs. + +## 模型分类 Model Taxonomy + +| 需求 Demand | 任务 Task | 系列 Series | 模型 Model | 参数 Parameter | 额外 Extra | +| :----: | :----: | :----: | :----: | :----: | :----: | +| 特殊 Special | 多模态 Multimodal | 太乙 Taiyi | Stable Diffusion | 1B | Chinese and English | + +## 模型信息 Model Information + +我们将[Noah-Wukong](https://wukong-dataset.github.io/wukong-dataset/)数据集(100M)和[Zero](https://zero.so.com/)数据集(23M)用作预训练的数据集,先用[IDEA-CCNL/Taiyi-CLIP-RoBERTa-102M-ViT-L-Chinese](https://huggingface.co./IDEA-CCNL/Taiyi-CLIP-RoBERTa-102M-ViT-L-Chinese)对这两个数据集的图文对相似性进行打分,取CLIP Score大于0.2的图文对作为我们的训练集。 我们使用[stable-diffusion-v1-4](https://huggingface.co./CompVis/stable-diffusion-v1-4)([论文](https://arxiv.org/abs/2112.10752))模型进行继续训练,其中训练分为两个stage。 + +第一个stage中冻住模型的其他部分,只训练text encoder,以便保留原始模型的生成能力且实现中文概念的对齐。 + +第二个stage中将全部模型解冻,一起训练text encoder和diffusion model,以便diffusion model更好的适配中文guidance。 + +第一个stage我们训练了80小时,第二个stage训练了100小时,两个stage都是用了8 x A100。该版本是一个初步的版本,我们将持续优化模型并开源,欢迎交流! + +We use [Noah-Wukong](https://wukong-dataset.github.io/wukong-dataset/)(100M) 和 [Zero](https://zero.so.com/)(23M) as our dataset, and take the image and text pairs with CLIP Score (based on [IDEA-CCNL/Taiyi-CLIP-RoBERTa-102M-ViT-L-Chinese](https://huggingface.co./IDEA-CCNL/Taiyi-CLIP-RoBERTa-102M-ViT-L-Chinese)) greater than 0.2 as our Training set. We finetune the [stable-diffusion-v1-4](https://huggingface.co./CompVis/stable-diffusion-v1-4)([paper](https://arxiv.org/abs/2112.10752)) model for two stage. + +Stage 1: To keep the powerful generative capability of stable diffusion and align Chinese concepts with the images, We only train the text encoder and freeze other part of the model in the first stage. + +Stage 2: We unfreeze both the text encoder and the diffusion model, therefore the diffusion model can have a better compatibility for the Chinese language guidance. + +It takes 80 hours to train the first stage, 100 hours to train the second stage, both stages are based on 8 x A100. This model is a preliminary version and we will update this model continuously and open sourse. Welcome to exchange! 
+ +### Result + +小桥流水人家,Van Gogh style。 +![](result_examples/xiaoqiao_vangogh.png) + +小桥流水人家,水彩。 +![](result_examples/xiaoqiao_oil_painting.png) + +吃过桥米线的猫。 +![](result_examples/cat_eating_guoqiao_noodle.png) + +穿着宇航服的哈士奇。 +![](result_examples/huskiy_wearing_space_suit.png) +## 使用 Usage + +### 全精度 Full precision + +```py +from diffusers import StableDiffusionPipeline + +pipe = StableDiffusionPipeline.from_pretrained("IDEA-CCNL/Taiyi-Stable-Diffusion-1B-Chinese-EN-v0.1").to("cuda") + +prompt = '小桥流水人家,Van Gogh style' +image = pipe(prompt, guidance_scale=10).images[0] +image.save("小桥.png") +``` + +### 半精度 Half precision FP16 (CUDA) + +添加 `torch_dtype=torch.float16` 和 `device_map="auto"` 可以快速加载 FP16 的权重,以加快推理速度。 +更多信息见 [the optimization docs](https://huggingface.co./docs/diffusers/main/en/optimization/fp16#half-precision-weights)。 + +```py +# !pip install git+https://github.com/huggingface/accelerate +import torch +from diffusers import StableDiffusionPipeline + +torch.backends.cudnn.benchmark = True +pipe = StableDiffusionPipeline.from_pretrained("IDEA-CCNL/Taiyi-Stable-Diffusion-1B-Chinese-EN-v0.1", torch_dtype=torch.float16) +pipe.to('cuda') + +prompt = '小桥流水人家,Van Gogh style' +image = pipe(prompt, guidance_scale=10.0).images[0] +image.save("小桥.png") +``` + + +## 引用 Citation + +如果您在您的工作中使用了我们的模型,可以引用我们的[总论文](https://arxiv.org/abs/2209.02970): + +If you are using the resource for your work, please cite the our [paper](https://arxiv.org/abs/2209.02970): + +```text +@article{fengshenbang, + author = {Junjie Wang and Yuxiang Zhang and Lin Zhang and Ping Yang and Xinyu Gao and Ziwei Wu and Xiaoqun Dong and Junqing He and Jianheng Zhuo and Qi Yang and Yongfeng Huang and Xiayu Li and Yanghan Wu and Junyu Lu and Xinyu Zhu and Weifeng Chen and Ting Han and Kunhao Pan and Rui Wang and Hao Wang and Xiaojun Wu and Zhongshen Zeng and Chongpei Chen and Ruyi Gan and Jiaxing Zhang}, + title = {Fengshenbang 1.0: Being the Foundation of Chinese Cognitive Intelligence}, + journal = {CoRR}, + volume = {abs/2209.02970}, + year = {2022} +} +``` + +也可以引用我们的[网站](https://github.com/IDEA-CCNL/Fengshenbang-LM/): + +You can also cite our [website](https://github.com/IDEA-CCNL/Fengshenbang-LM/): + +```text +@misc{Fengshenbang-LM, + title={Fengshenbang-LM}, + author={IDEA-CCNL}, + year={2021}, + howpublished={\url{https://github.com/IDEA-CCNL/Fengshenbang-LM}}, +} +``` diff --git a/fengshen/examples/stable_diffusion_chinese_EN/result_examples/cat_eating_guoqiao_noodle.png b/fengshen/examples/stable_diffusion_chinese_EN/result_examples/cat_eating_guoqiao_noodle.png new file mode 100644 index 0000000000000000000000000000000000000000..0c28cf33aaba77e00d357110487947d594b23e43 --- /dev/null +++ b/fengshen/examples/stable_diffusion_chinese_EN/result_examples/cat_eating_guoqiao_noodle.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:200db123a5b4f56315480d9e853e950a9c99b020e652cdc6875b57a90b1df9ae +size 2420718 diff --git a/fengshen/examples/stable_diffusion_chinese_EN/result_examples/huskiy_wearing_space_suit.png b/fengshen/examples/stable_diffusion_chinese_EN/result_examples/huskiy_wearing_space_suit.png new file mode 100644 index 0000000000000000000000000000000000000000..81ba2b54362dcf93a28e6af2f0a13f9610ad0b82 --- /dev/null +++ b/fengshen/examples/stable_diffusion_chinese_EN/result_examples/huskiy_wearing_space_suit.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:40597ef987aaa22d7b4de63f9b280c587fd8d973ba4a62fddf35f3df472134cb +size 2215674 diff --git 
a/fengshen/examples/stable_diffusion_chinese_EN/result_examples/xiaoqiao_oil_painting.png b/fengshen/examples/stable_diffusion_chinese_EN/result_examples/xiaoqiao_oil_painting.png new file mode 100644 index 0000000000000000000000000000000000000000..86ae863ffe97e37bbfa0231e9033058c9d44d721 --- /dev/null +++ b/fengshen/examples/stable_diffusion_chinese_EN/result_examples/xiaoqiao_oil_painting.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3d7a9780d438506eb151b21d792ff6224f343f39818ed0aadbc786d378b05a54 +size 3006257 diff --git a/fengshen/examples/stable_diffusion_chinese_EN/result_examples/xiaoqiao_vangogh.png b/fengshen/examples/stable_diffusion_chinese_EN/result_examples/xiaoqiao_vangogh.png new file mode 100644 index 0000000000000000000000000000000000000000..f18d210aaf71655b907df7b3c39a6902f4e1942c --- /dev/null +++ b/fengshen/examples/stable_diffusion_chinese_EN/result_examples/xiaoqiao_vangogh.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:867b1cb80acca6540fd45488b7cee37320c4e4340464a9c2f8d5c2ff76fa8e92 +size 3610124 diff --git a/fengshen/examples/stable_diffusion_dreambooth/duck_result.png b/fengshen/examples/stable_diffusion_dreambooth/duck_result.png new file mode 100644 index 0000000000000000000000000000000000000000..1104e1d3183cbfe3dfa1f6ef20e56daa75c07482 --- /dev/null +++ b/fengshen/examples/stable_diffusion_dreambooth/duck_result.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d9d752cfd10e54199c248419ac9ea01440527913d5c75227f5e742aa0a5f5787 +size 1409847 diff --git a/fengshen/examples/stable_diffusion_dreambooth/readme.md b/fengshen/examples/stable_diffusion_dreambooth/readme.md new file mode 100644 index 0000000000000000000000000000000000000000..9e73440bc0b9a933a7aaadf912a33b3899ee3f60 --- /dev/null +++ b/fengshen/examples/stable_diffusion_dreambooth/readme.md @@ -0,0 +1,58 @@ +# Taiyi-Stable-Diffusion Dreambooth示例 + +本示例可以应用于[**IDEA-CCNL/Taiyi-Stable-Diffusion-1B-Chinese-v0.1**](https://huggingface.co./IDEA-CCNL/Taiyi-Stable-Diffusion-1B-Chinese-v0.1)在自建的数据集上用[**DreamBooth**](https://arxiv.org/abs/2208.12242)的方法进行特定对象的训练,同时稍微修改代码也能够兼容大部分Stable-Diffusion结构。本示例仅提供参考,有任何疑问或者有需要协助的都可以提Issue到本项目中,会有专门的同学解答~ + +## 数据处理 + +在./train_images_duck下有我们进行展示的一个数据集样例 + +## 配置要求 + +[**IDEA-CCNL/Taiyi-Stable-Diffusion-1B-Chinese-v0.1**](https://huggingface.co./IDEA-CCNL/Taiyi-Stable-Diffusion-1B-Chinese-v0.1)十亿级别参数进行dreambooth训练,我们自己测试的配置基础如下,batch_size设定为1~2,另外也可以参考train_with_prior.sh进行fp16和deepspeed加速。 + +fp32: + +- 显存:26G以上 +- 内存:64G以上 + +## 运行脚本 + +标准版本 + +sh train.sh + +增加先验版本,具体可以参考[论文](https://arxiv.org/abs/2208.12242) + +sh train_with_prior.sh + +在脚本中也提供了丰富的超参供大家修改,例如batch_size, ckpt_path等等都可以根据自己的需求做更改,其中model_path指向的是huggingface上的模型路径,下载可能比较慢,如果用户已经在本地下载过一份权重,直接将model_path改成本地路径即可。 + +一些常用的参数我们会放在[封神榜的文档里](https://fengshenbang-doc.readthedocs.io/zh/latest/docs/%E5%B0%81%E7%A5%9E%E6%A1%86%E6%9E%B6/%E5%8F%82%E6%95%B0%E7%AE%A1%E7%90%86.html) + +有任何不清楚的地方,不要吝啬你的Issue,直接提过来。 + +## 一些训练中的Trick + +### Deepspeed + +在示例中我们默认开始了Deepspeed,通过Deepspeed我们能提高不少训练效率(即使是单卡)。并且得益于Zero Redundancy Optimizer的技术,在多卡的环境我们能显著的减少显存占用,提高batch_size以获得更高的效率,强烈建议有条件的同学开启Deepspeed。train_with_prior.sh在40G的A100上需要开启deepspeed. + +### 一点经验 + +- 图片选取质量更高,图片背景尽量选取纯色 + +- 对于人脸或者更复杂的可以采用较小的学习率,训练更长的step(800-1200) + +- 目训练的效果不是特别稳定,没有论文中如此惊艳的效果,不过参考[太乙webui的配置](https://github.com/IDEA-CCNL/stable-diffusion-webui/blob/master/README.md)还是能找到很多不错的效果图 + +- 持续探索中... 
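+
+训练结束后,train.py 会把模型以 diffusers 的目录格式保存到 default_root_dir 下的 hf_out_* 目录中,可以参考下面的示意代码加载它做推理(目录名、轮数和提示词都只是示例,请按自己的实际训练结果调整):
+
+```python
+# 仅作示意:加载 DreamBooth 训练保存的 hf_out_* 目录进行推理
+import torch
+from diffusers import StableDiffusionPipeline
+
+model_dir = "../../workspace/taiyi-sd-dreambooth/hf_out_199"  # 假设的输出目录,按实际路径修改
+pipe = StableDiffusionPipeline.from_pretrained(model_dir, torch_dtype=torch.float16)
+pipe.to("cuda")
+
+prompt = "[小黄鸭]在沙滩上晒太阳"   # 用训练时的 instance prompt 触发学到的对象
+image = pipe(prompt, guidance_scale=7.5).images[0]
+image.save("duck_inference.png")
+```
+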
+ +![结果](duck_result.png) + +### 参考资料 +https://arxiv.org/abs/2208.12242 + +https://dreambooth.github.io/ + +https://wandb.ai/psuraj/dreambooth/reports/Dreambooth-Training-Analysis--VmlldzoyNzk0NDc3 + diff --git a/fengshen/examples/stable_diffusion_dreambooth/requirements.txt b/fengshen/examples/stable_diffusion_dreambooth/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..ebb2db2051981e04dc7ab582b27b14904872aae3 --- /dev/null +++ b/fengshen/examples/stable_diffusion_dreambooth/requirements.txt @@ -0,0 +1,8 @@ +diffusers>==0.7.2 +torchvision +transformers>==4.24.0 +pytorch-lightning>==1.8.1 +ftfy +tensorboard +modelcards +deepspeed>==0.5.10 \ No newline at end of file diff --git a/fengshen/examples/stable_diffusion_dreambooth/train.py b/fengshen/examples/stable_diffusion_dreambooth/train.py new file mode 100644 index 0000000000000000000000000000000000000000..d783590e4ebb9e8069b6a5bebdd36f0be57309e6 --- /dev/null +++ b/fengshen/examples/stable_diffusion_dreambooth/train.py @@ -0,0 +1,276 @@ +# -*- encoding: utf-8 -*- +''' +Copyright 2022 The International Digital Economy Academy (IDEA). CCNL team. All rights reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +@File : train.py +@Time : 2022/11/09 22:27 +@Author : Gan Ruyi +@Version : 1.0 +@Contact : ganruyi@idea.edu.cn +@License : (C)Copyright 2022-2023, CCNL-IDEA +''' +import hashlib +import itertools +import os +from pathlib import Path +from tqdm.auto import tqdm +import torch +import argparse +from pytorch_lightning import ( + LightningModule, + Trainer, +) +from pytorch_lightning.callbacks import ( + LearningRateMonitor, +) +from transformers import BertTokenizer, BertModel, CLIPTokenizer, CLIPTextModel +from diffusers import AutoencoderKL, DDPMScheduler, StableDiffusionPipeline, UNet2DConditionModel +from torch.nn import functional as F +from fengshen.data.dreambooth_datasets.dreambooth_datasets import PromptDataset, DreamBoothDataset +from fengshen.data.universal_datamodule import UniversalDataModule +from fengshen.models.model_utils import ( + add_module_args, + configure_optimizers, + get_total_steps, +) +from fengshen.utils.universal_checkpoint import UniversalCheckpoint +from fengshen.data.dreambooth_datasets.dreambooth_datasets import add_data_args + + +class StableDiffusionDreamBooth(LightningModule): + @staticmethod + def add_module_specific_args(parent_parser): + parser = parent_parser.add_argument_group('Taiyi Stable Diffusion Module') + parser.add_argument('--train_text_encoder', action='store_true', default=False) + # dreambooth train unet only default + parser.add_argument('--train_unet', action='store_true', default=True) + return parent_parser + + def __init__(self, args): + super().__init__() + if 'Taiyi-Stable-Diffusion-1B-Chinese-v0.1' in args.model_path: + self.tokenizer = BertTokenizer.from_pretrained( + args.model_path, subfolder="tokenizer") + self.text_encoder = BertModel.from_pretrained( + args.model_path, subfolder="text_encoder") # load from taiyi_finetune-v0 + else: + self.tokenizer = CLIPTokenizer.from_pretrained( + args.model_path, subfolder="tokenizer") + self.text_encoder = 
CLIPTextModel.from_pretrained( + args.model_path, subfolder="text_encoder") + self.vae = AutoencoderKL.from_pretrained( + args.model_path, subfolder="vae") + self.unet = UNet2DConditionModel.from_pretrained( + args.model_path, subfolder="unet") + self.noise_scheduler = DDPMScheduler.from_config( + args.model_path, subfolder="scheduler") + + # set model + self.vae.requires_grad_(False) + if not args.train_text_encoder: + self.requires_grad_(False) + if not args.train_unet: + self.requires_grad_(False) + + self.save_hyperparameters(args) + + def generate_extra_data(self): + global_rank = self.global_rank + device = self.trainer.device_ids[global_rank] + print('generate on device {} of global_rank {}'.format(device, global_rank)) + class_images_dir = Path(self.hparams.class_data_dir) + if not class_images_dir.exists(): + class_images_dir.mkdir(parents=True) + cur_class_images = len(list(class_images_dir.iterdir())) + + if cur_class_images < self.hparams.num_class_images: + pipeline = StableDiffusionPipeline.from_pretrained( + self.hparams.model_path, + safety_checker=None, + ) + pipeline.set_progress_bar_config(disable=True) + + num_new_images = self.hparams.num_class_images - cur_class_images + print(f"Number of class images to sample: {num_new_images}.") + + sample_dataset = PromptDataset(self.hparams.class_prompt, num_new_images) + sample_dataloader = torch.utils.data.DataLoader(sample_dataset, batch_size=self.hparams.sample_batch_size) + + pipeline.to(device) + + for example in tqdm( + sample_dataloader, desc="Generating class images", disable=global_rank != 0 + ): + images = pipeline(example["prompt"]).images + + for i, image in enumerate(images): + hash_image = hashlib.sha1(image.tobytes()).hexdigest() + image_filename = class_images_dir / f"{example['index'][i] + cur_class_images}-{hash_image}.jpg" + image.save(image_filename) + + del pipeline + # if torch.cuda.is_available(): + # torch.cuda.empty_cache() + + def setup(self, stage) -> None: + if self.hparams.with_prior_preservation: + self.generate_extra_data() + if stage == 'fit': + self.total_steps = get_total_steps(self.trainer, self.hparams) + print('Total steps: {}' .format(self.total_steps)) + + def configure_optimizers(self): + model_params = [] + if self.hparams.train_unet and self.hparams.train_text_encoder: + model_params = itertools.chain(self.unet.parameters(), self.text_encoder.parameters()) + elif self.hparams.train_unet: + model_params = self.unet.parameters() + elif self.hparams.train_text_encoder: + model_params = self.text_encoder.parameters() + return configure_optimizers(self, model_params=model_params) + + def training_step(self, batch, batch_idx): + if self.hparams.train_text_encoder: + self.text_encoder.train() + if self.hparams.train_unet: + self.unet.train() + + latents = self.vae.encode(batch["pixel_values"]).latent_dist.sample() + latents = latents * 0.18215 + + # Sample noise that we'll add to the latents + noise = torch.randn(latents.shape).to(latents.device) + noise = noise.to(dtype=self.unet.dtype) + bsz = latents.shape[0] + # Sample a random timestep for each image + timesteps = torch.randint( + 0, self.noise_scheduler.config.num_train_timesteps, (bsz,), device=latents.device) + timesteps = timesteps.long() + # Add noise to the latents according to the noise magnitude at each timestep + # (this is the forward diffusion process) + + noisy_latents = self.noise_scheduler.add_noise(latents, noise, timesteps) + noisy_latents = noisy_latents.to(dtype=self.unet.dtype) + + # Get the text embedding for 
conditioning + # with torch.no_grad(): + encoder_hidden_states = self.text_encoder(batch["input_ids"])[0] + + # Predict the noise residual + noise_pred = self.unet(noisy_latents, timesteps, encoder_hidden_states).sample + + if self.hparams.with_prior_preservation: + # Chunk the noise and noise_pred into two parts and compute the loss on each part separately. + noise_pred, noise_pred_prior = torch.chunk(noise_pred, 2, dim=0) + noise, noise_prior = torch.chunk(noise, 2, dim=0) + # Compute instance loss + loss = F.mse_loss(noise_pred, noise, reduction="none").mean([1, 2, 3]).mean() + # Compute prior loss + prior_loss = F.mse_loss(noise_pred_prior, noise_prior, reduction="mean") + # Add the prior loss to the instance loss. + loss = loss + args.prior_loss_weight * prior_loss + else: + loss = F.mse_loss(noise_pred, noise, reduction="mean") + self.log("train_loss", loss.item(), on_epoch=False, prog_bar=True, logger=True) + + if self.trainer.global_rank == 0: + if (self.global_step+1) % 5000 == 0: + print('saving model...') + pipeline = StableDiffusionPipeline.from_pretrained( + args.model_path, unet=self.unet, text_encoder=self.text_encoder, tokenizer=self.tokenizer, + ) + pipeline.save_pretrained(os.path.join( + args.default_root_dir, f'hf_out_{self.trainer.current_epoch}')) + + return {"loss": loss} + + def on_train_end(self) -> None: + if self.trainer.global_rank == 0: + print('saving model...') + pipeline = StableDiffusionPipeline.from_pretrained( + args.model_path, unet=self.unet, text_encoder=self.text_encoder, tokenizer=self.tokenizer, + ) + pipeline.save_pretrained(os.path.join( + args.default_root_dir, f'hf_out_{self.trainer.current_epoch}')) + + def on_load_checkpoint(self, checkpoint) -> None: + # 兼容低版本lightning,低版本lightning从ckpt起来时steps数会被重置为0 + global_step_offset = checkpoint["global_step"] + if 'global_samples' in checkpoint: + self.consumed_samples = checkpoint['global_samples'] + self.trainer.fit_loop.epoch_loop._batches_that_stepped = global_step_offset + + +if __name__ == '__main__': + args_parser = argparse.ArgumentParser() + args_parser = add_module_args(args_parser) + args_parser = add_data_args(args_parser) + args_parser = UniversalDataModule.add_data_specific_args(args_parser) + args_parser = Trainer.add_argparse_args(args_parser) + args_parser = StableDiffusionDreamBooth.add_module_specific_args(args_parser) + args_parser = UniversalCheckpoint.add_argparse_args(args_parser) + args = args_parser.parse_args() + + model = StableDiffusionDreamBooth(args) + + tokenizer = model.tokenizer + datasets = DreamBoothDataset( + instance_data_dir=args.instance_data_dir, + instance_prompt=args.instance_prompt, + tokenizer=tokenizer, + class_data_dir=args.class_data_dir, + class_prompt=args.class_prompt, + size=512, + center_crop=args.center_crop, + ) + # construct the datasets to a dict for universal_datamodule + datasets = {'train': datasets} + + def collate_fn(examples): + # print(examples) + input_ids = [example["instance_prompt_ids"] for example in examples] + pixel_values = [example["instance_images"] for example in examples] + + # Concat class and instance examples for prior preservation. + # We do this to avoid doing two forward passes. 
+ if args.with_prior_preservation: + input_ids += [example["class_prompt_ids"] for example in examples] + pixel_values += [example["class_images"] for example in examples] + + pixel_values = torch.stack(pixel_values) + pixel_values = pixel_values.to(memory_format=torch.contiguous_format).float() + + input_ids = tokenizer.pad( + {"input_ids": input_ids}, + padding="max_length", + max_length=tokenizer.model_max_length, + return_tensors="pt", + ).input_ids + + batch = { + "input_ids": input_ids, + "pixel_values": pixel_values, + } + + return batch + + datamodule = UniversalDataModule( + tokenizer=tokenizer, collate_fn=collate_fn, args=args, datasets=datasets) + + lr_monitor = LearningRateMonitor(logging_interval='step') + checkpoint_callback = UniversalCheckpoint(args) + + trainer = Trainer.from_argparse_args(args, + callbacks=[ + lr_monitor, + checkpoint_callback]) + + trainer.fit(model, datamodule, ckpt_path=args.load_ckpt_path) diff --git a/fengshen/examples/stable_diffusion_dreambooth/train.sh b/fengshen/examples/stable_diffusion_dreambooth/train.sh new file mode 100644 index 0000000000000000000000000000000000000000..ad3eb7ead394e6662168eb0b4947055277a01b58 --- /dev/null +++ b/fengshen/examples/stable_diffusion_dreambooth/train.sh @@ -0,0 +1,75 @@ +#!/bin/bash +#SBATCH --job-name=taiyi-sd-dreambooth # create a short name for your job +#SBATCH --nodes=1 # node count +#SBATCH --ntasks-per-node=1 # number of tasks to run per node +#SBATCH --cpus-per-task=30 # cpu-cores per task (>1 if multi-threaded tasks) +#SBATCH --gres=gpu:1 # number of gpus per node +#SBATCH -o %x-%j.log # output and error log file names (%x for job id) +#SBATCH -x dgx050 + +# pwd=Fengshenbang-LM/fengshen/examples/pretrain_erlangshen +ROOT_DIR=../../workspace +# export CUDA_VISIBLE_DEVICES='7' +export TORCH_EXTENSIONS_DIR=${ROOT_DIR}/torch_extendsions + +MODEL_NAME=taiyi-sd-dreambooth +MODEL_ROOT_DIR=$ROOT_DIR/${MODEL_NAME} +if [ ! 
-d ${MODEL_ROOT_DIR} ];then + mkdir ${MODEL_ROOT_DIR} +fi + +NNODES=1 +GPUS_PER_NODE=1 + +MICRO_BATCH_SIZE=1 +INSTANCE_PROMPT="小黄鸭" +OUTPUT_DIR="saved_model_tinyduck" +INSTANCE_DIR="train_images_duck" + +DATA_ARGS="\ + --dataloader_workers 2 \ + --train_batchsize $MICRO_BATCH_SIZE \ + --val_batchsize $MICRO_BATCH_SIZE \ + --test_batchsize $MICRO_BATCH_SIZE \ + --instance_data_dir=$INSTANCE_DIR \ + --instance_prompt=$INSTANCE_PROMPT \ + --resolution=512 \ + " + +MODEL_ARGS="\ + --model_path $MODEL_ROOT_DIR/pretrain/Taiyi-Stable-Diffusion-1B-Chinese-v0.1/ \ + --train_text_encoder \ + --learning_rate 1e-6 \ + --scheduler_type constant \ + --warmup_steps 100 \ + " + +MODEL_CHECKPOINT_ARGS="\ + --save_ckpt_path ${MODEL_ROOT_DIR}/ckpt \ + --load_ckpt_path ${MODEL_ROOT_DIR}/ckpt/last.ckpt \ + " + +TRAINER_ARGS="\ + --max_steps 1200 \ + --gpus $GPUS_PER_NODE \ + --num_nodes $NNODES \ + --strategy ddp \ + --log_every_n_steps 100 \ + --precision 32 \ + --default_root_dir ${MODEL_ROOT_DIR} \ + --replace_sampler_ddp False \ + --num_sanity_val_steps 0 \ + --limit_val_batches 0 \ + " +# num_sanity_val_steps, limit_val_batches 通过这俩参数把validation关了 + +export options=" \ + $DATA_ARGS \ + $MODEL_ARGS \ + $MODEL_CHECKPOINT_ARGS \ + $TRAINER_ARGS \ + " +# run local +python train.py $options +# run on slurm +# srun python train.py $options \ No newline at end of file diff --git a/fengshen/examples/stable_diffusion_dreambooth/train_images_duck/02e791d8e91ddc2040e96675ab6873a.jpg b/fengshen/examples/stable_diffusion_dreambooth/train_images_duck/02e791d8e91ddc2040e96675ab6873a.jpg new file mode 100644 index 0000000000000000000000000000000000000000..79a9894f17d1a4e06a2e06ee1fab125d030008da Binary files /dev/null and b/fengshen/examples/stable_diffusion_dreambooth/train_images_duck/02e791d8e91ddc2040e96675ab6873a.jpg differ diff --git a/fengshen/examples/stable_diffusion_dreambooth/train_images_duck/ab1acecf23c6809a0fb12ffb169c795.jpg b/fengshen/examples/stable_diffusion_dreambooth/train_images_duck/ab1acecf23c6809a0fb12ffb169c795.jpg new file mode 100644 index 0000000000000000000000000000000000000000..fca7dc4c48c4ae2e67942f304146227c6ea7a261 Binary files /dev/null and b/fengshen/examples/stable_diffusion_dreambooth/train_images_duck/ab1acecf23c6809a0fb12ffb169c795.jpg differ diff --git a/fengshen/examples/stable_diffusion_dreambooth/train_images_duck/f2595677df44dddae46f23578ea91e9.jpg b/fengshen/examples/stable_diffusion_dreambooth/train_images_duck/f2595677df44dddae46f23578ea91e9.jpg new file mode 100644 index 0000000000000000000000000000000000000000..5a6aa286d6d55c3e6340457ac3e97fa3a7467b83 Binary files /dev/null and b/fengshen/examples/stable_diffusion_dreambooth/train_images_duck/f2595677df44dddae46f23578ea91e9.jpg differ diff --git a/fengshen/examples/stable_diffusion_dreambooth/train_images_duck/fa936e11c9f4419e91ad57d5041f739.jpg b/fengshen/examples/stable_diffusion_dreambooth/train_images_duck/fa936e11c9f4419e91ad57d5041f739.jpg new file mode 100644 index 0000000000000000000000000000000000000000..7b29eaeb52beace4f09b198f0b362ecd27b59283 Binary files /dev/null and b/fengshen/examples/stable_diffusion_dreambooth/train_images_duck/fa936e11c9f4419e91ad57d5041f739.jpg differ diff --git a/fengshen/examples/stable_diffusion_dreambooth/train_with_prior.sh b/fengshen/examples/stable_diffusion_dreambooth/train_with_prior.sh new file mode 100644 index 0000000000000000000000000000000000000000..623972b04949ed5b81eb708f1f3b908907100db4 --- /dev/null +++ b/fengshen/examples/stable_diffusion_dreambooth/train_with_prior.sh 
@@ -0,0 +1,100 @@ +#!/bin/bash +#SBATCH --job-name=taiyi-sd-dreambooth # create a short name for your job +#SBATCH --nodes=1 # node count +#SBATCH --ntasks-per-node=2 # number of tasks to run per node +#SBATCH --cpus-per-task=30 # cpu-cores per task (>1 if multi-threaded tasks) +#SBATCH --gres=gpu:2 # number of gpus per node +#SBATCH -o %x-%j.log # output and error log file names (%x for job id) +#SBATCH -x dgx050 + +# pwd=Fengshenbang-LM/fengshen/examples/pretrain_erlangshen +ROOT_DIR=../../workspace +# export CUDA_VISIBLE_DEVICES='2' +export TORCH_EXTENSIONS_DIR=${ROOT_DIR}/torch_extendsions + +MODEL_NAME=taiyi-sd-dreambooth-prior +MODEL_ROOT_DIR=$ROOT_DIR/${MODEL_NAME} +if [ ! -d ${MODEL_ROOT_DIR} ];then + mkdir ${MODEL_ROOT_DIR} +fi + +NNODES=1 +GPUS_PER_NODE=2 +MICRO_BATCH_SIZE=2 +# 如果你不用Deepspeed的话 下面的一段话都可以删掉 Begin +CONFIG_JSON="$MODEL_ROOT_DIR/${MODEL_NAME}.ds_config.json" +ZERO_STAGE=1 +# Deepspeed figures out GAS dynamically from dynamic GBS via set_train_batch_size() +cat < $CONFIG_JSON +{ + "zero_optimization": { + "stage": ${ZERO_STAGE} + }, + "fp16": { + "enabled": true + }, + "train_micro_batch_size_per_gpu": $MICRO_BATCH_SIZE +} +EOT +export PL_DEEPSPEED_CONFIG_PATH=$CONFIG_JSON +### End + +INSTANCE_PROMPT="[小黄鸭]" +OUTPUT_DIR="saved_model_duck2" +INSTANCE_DIR="train_images_duck" + +CLASS_PROMPT="小黄鸭" +CLASS_DIR="class_images_duck" + +DATA_ARGS="\ + --dataloader_workers 2 \ + --train_batchsize $MICRO_BATCH_SIZE \ + --val_batchsize $MICRO_BATCH_SIZE \ + --test_batchsize $MICRO_BATCH_SIZE \ + --instance_data_dir=$INSTANCE_DIR \ + --instance_prompt=$INSTANCE_PROMPT \ + --class_prompt=$CLASS_PROMPT \ + --class_data_dir=$CLASS_DIR \ + --with_prior_preservation --prior_loss_weight=1.0 \ + --num_class_images=200 \ + --resolution=512 \ + --sample_batch_size=1 \ + " + +MODEL_ARGS="\ + --model_path $MODEL_ROOT_DIR/pretrain/Taiyi-Stable-Diffusion-1B-Chinese-v0.1/ \ + --train_text_encoder \ + --learning_rate 1e-6 \ + --scheduler_type constant \ + " + +MODEL_CHECKPOINT_ARGS="\ + --every_n_epochs 100 \ + --save_ckpt_path ${MODEL_ROOT_DIR}/ckpt \ + --load_ckpt_path ${MODEL_ROOT_DIR}/ckpt/last.ckpt \ + " + +TRAINER_ARGS="\ + --max_epochs 200 \ + --gpus $GPUS_PER_NODE \ + --num_nodes $NNODES \ + --strategy deepspeed_stage_${ZERO_STAGE} \ + --log_every_n_steps 100 \ + --precision 16 \ + --default_root_dir ${MODEL_ROOT_DIR} \ + --replace_sampler_ddp False \ + --num_sanity_val_steps 0 \ + --limit_val_batches 0 \ + " +# num_sanity_val_steps, limit_val_batches 通过这俩参数把validation关了 + +export options=" \ + $DATA_ARGS \ + $MODEL_ARGS \ + $MODEL_CHECKPOINT_ARGS \ + $TRAINER_ARGS \ + " +# run local +# python train.py $options +# run on slurm +srun python train.py $options \ No newline at end of file diff --git a/fengshen/examples/summary/pretrain_bart_summary.sh b/fengshen/examples/summary/pretrain_bart_summary.sh new file mode 100644 index 0000000000000000000000000000000000000000..f8a6af24f935cc563891922b8a50cd293231367b --- /dev/null +++ b/fengshen/examples/summary/pretrain_bart_summary.sh @@ -0,0 +1,124 @@ +#!/bin/bash +#SBATCH --job-name=bart_summary +#SBATCH --nodes=1 +#SBATCH --ntasks-per-node=4 +#SBATCH --gres=gpu:4 # number of gpus +#SBATCH -o %x-%j.log + +set -x -e + +echo "START TIME: $(date)" +MODEL_NAME=bart-base +MICRO_BATCH_SIZE=16 +ROOT_DIR=/cognitive_comp/dongxiaoqun/finetune/${MODEL_NAME} + +ZERO_STAGE=1 +export TORCH_EXTENSIONS_DIR=/cognitive_comp/dongxiaoqun/torch_extendsions +config_json="./ds_config.${MODEL_NAME}.json" + +# Deepspeed figures out GAS dynamically from dynamic GBS via 
set_train_batch_size() +cat < $config_json +{ + "train_micro_batch_size_per_gpu": ${MICRO_BATCH_SIZE}, + "steps_per_print": 100, + "gradient_clipping": 1.0, + "zero_optimization": { + "stage": $ZERO_STAGE, + "contiguous_gradients": false, + "overlap_comm": true, + "reduce_scatter": true, + "reduce_bucket_size": 50000000, + "allgather_bucket_size": 500000000 + }, + "optimizer": { + "type": "Adam", + "params": { + "lr": 1e-4, + "betas": [ + 0.9, + 0.95 + ], + "eps": 1e-8, + "weight_decay": 5e-2 + } + }, + "scheduler": { + "type": "WarmupLR", + "params":{ + "warmup_min_lr": 5e-6, + "warmup_max_lr": 1e-4 + } + }, + "zero_allow_untested_optimizer": false, + "fp16": { + "enabled": true, + "loss_scale": 0, + "loss_scale_window": 1000, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "activation_checkpointing": { + "partition_activations": false, + "contiguous_memory_optimization": false + }, + "wall_clock_breakdown": false +} +EOT + +# export PL_DEEPSPEED_CONFIG_PATH=$config_json + +TRAINER_ARGS=" + --max_epochs 2 \ + --gpus 1 \ + --num_nodes 1 \ + --strategy deepspeed_stage_${ZERO_STAGE} \ + --default_root_dir $ROOT_DIR \ + --dirpath $ROOT_DIR/ckpt \ + --save_top_k 3 \ + --monitor val_loss \ + --mode min \ + --save_last \ + --every_n_train_steps 0 \ + --val_check_interval 0.1 \ +" + +prompt='"' +DATA_ARGS=" + --datasets_name lcsts \ + --num_workers 8 \ + --train_batchsize $MICRO_BATCH_SIZE \ + --val_batchsize $MICRO_BATCH_SIZE \ + --test_batchsize $MICRO_BATCH_SIZE \ + --max_enc_length 128 \ + --max_dec_length 64 \ + --val_datasets_field val \ + --prompt $prompt \ +" + +MODEL_ARGS=" + --pretrained_model_path /cognitive_comp/gaoxinyu/pretrained_model/bart-base \ + --output_save_path $ROOT_DIR/${MODEL_NAME}_predict_lcsts.json \ + --learning_rate 1e-4 \ + --weight_decay 0.1 \ + --precision 16 \ +" + +SCRIPTS_PATH=seq2seq_summary.py + +export CMD=" \ + $SCRIPTS_PATH \ + $TRAINER_ARGS \ + $MODEL_ARGS \ + $DATA_ARGS \ + " + +echo $CMD + +#singularity exec --nv -B /cognitive_comp/ganruyi/Megatron/:/cognitive_comp/ganruyi/Megatron/,/cognitive_comp/gaoxinyu/:/cognitive_comp/gaoxinyu/ $SINGULARITY_PATH python $CMD + +# to debug - add echo (it exits and prints what it would have launched) +#run_cmd="$PY_LAUNCHER $CMD" +# srun --nodes=1 --gres=gpu:4 --ntasks-per-node=4 --cpus-per-gpu=20 +source activate +conda activate torchnew +srun --nodes=1 --ntasks-per-node=1 --gres=gpu:1 --cpus-per-task=30 -o ${MODEL_NAME}-%J.log --jobid=229623 bash -c 'python3 $SCRIPT_PATH $CMD' diff --git a/fengshen/examples/summary/randeng_pegasus_523M_summary.sh b/fengshen/examples/summary/randeng_pegasus_523M_summary.sh new file mode 100644 index 0000000000000000000000000000000000000000..10f6d29a6acd1fe70117d0f1b8d33ce58cdb1384 --- /dev/null +++ b/fengshen/examples/summary/randeng_pegasus_523M_summary.sh @@ -0,0 +1,143 @@ +#!/bin/bash +#SBATCH --job-name=randeng_pegasus_523M_summary +#SBATCH --nodes=1 +#SBATCH --ntasks-per-node=8 +#SBATCH --gres=gpu:8 # number of gpus +#SBATCH --cpus-per-task=30 +#SBATCH -o %x-%j.log + +set -x -e + +echo "START TIME: $(date)" +MODEL_NAME=randeng_pegasus_523M_summary_last +MICRO_BATCH_SIZE=128 +ROOT_DIR=/cognitive_comp/dongxiaoqun/finetune/${MODEL_NAME} + +if [ ! -d ${ROOT_DIR} ];then + mkdir ${ROOT_DIR} + echo ${ROOT_DIR} created!!!!!!!!!!!!!! +else + echo ${ROOT_DIR} exist!!!!!!!!!!!!!!! +fi + +output_save_path=$ROOT_DIR/${MODEL_NAME}.json +if [ -f ${output_save_path} ];then + echo ${output_save_path} exist, rm it!!!!!!!!!!!!!!!!! 
+ rm ${output_save_path} +fi + +ZERO_STAGE=1 + +config_json="${ROOT_DIR}/ds_config.${MODEL_NAME}.json" + +# Deepspeed figures out GAS dynamically from dynamic GBS via set_train_batch_size() +cat < $config_json +{ + "train_micro_batch_size_per_gpu": ${MICRO_BATCH_SIZE}, + "steps_per_print": 1000, + "gradient_clipping": 1.0, + "zero_optimization": { + "stage": $ZERO_STAGE, + "contiguous_gradients": false, + "overlap_comm": true, + "reduce_scatter": true, + "reduce_bucket_size": 50000000, + "allgather_bucket_size": 500000000 + }, + "optimizer": { + "type": "Adam", + "params": { + "lr": 5e-5, + "betas": [ + 0.9, + 0.999 + ], + "eps": 1e-8, + "weight_decay": 1e-2 + } + }, + "scheduler": { + "params": { + "warmup_min_lr": 1e-8, + "warmup_max_lr": 1e-4, + "total_num_steps": 60000, + "warmup_num_steps" : 1000 + }, + "type": "WarmupDecayLR" + }, + "zero_allow_untested_optimizer": false, + "fp16": { + "enabled": true, + "loss_scale": 0, + "loss_scale_window": 1000, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "activation_checkpointing": { + "partition_activations": false, + "contiguous_memory_optimization": false + }, + "wall_clock_breakdown": false +} +EOT + +export PL_DEEPSPEED_CONFIG_PATH=$config_json +export TORCH_EXTENSIONS_DIR=/cognitive_comp/dongxiaoqun/torch_extendsions +# export MASTER_PORT=$[RANDOM%10000+50000] +# +# --strategy deepspeed_stage_${ZERO_STAGE} \ +TRAINER_ARGS=" + --max_epochs 10 \ + --gpus 1 \ + --num_nodes 1 \ + --strategy deepspeed_stage_${ZERO_STAGE} \ + --default_root_dir $ROOT_DIR \ + --dirpath $ROOT_DIR/ckpt \ + --save_top_k 3 \ + --monitor val_loss \ + --mode min \ + --save_last \ + --every_n_train_steps 10000 \ + --val_check_interval 0.1 \ +" +prompt='"' +DATA_ARGS=" + --datasets_name lcsts \ + --num_workers 30 \ + --train_batchsize $MICRO_BATCH_SIZE \ + --val_batchsize $MICRO_BATCH_SIZE \ + --test_batchsize $MICRO_BATCH_SIZE \ + --max_enc_length 128 \ + --max_dec_length 64 \ + --val_datasets_field val \ + --prompt $prompt \ +" + +# --prompt $prompt \ +# --pretrained_model_path /cognitive_comp/ganruyi/experiments/randeng_t5_77M_summary/ckpt/hf_pretrained_epoch1_step75019 \ + +# mode_path="/cognitive_comp/dongxiaoqun/train_model/fengshen-pegasus-base/ckpt/hf_pretrained_epoch0_step22200/" +mode_path="/cognitive_comp/dongxiaoqun/train_model/fengshen-pegasus-large/ckpt/hf_pretrained_epoch0_step122000" +cp /cognitive_comp/dongxiaoqun/pretrained_model/pegasus-large/vocab.txt $mode_path/ + +MODEL_ARGS=" + --pretrained_model_path $mode_path \ + --output_save_path $output_save_path \ + --self_tokenizer \ +" + +SCRIPTS_PATH=/cognitive_comp/dongxiaoqun/debug/Fengshenbang-LM/fengshen/examples/summary/seq2seq_summary.py + +export CMD=" \ + $SCRIPTS_PATH \ + $TRAINER_ARGS \ + $MODEL_ARGS \ + $DATA_ARGS \ + " + +echo $CMD + +source activate +conda activate torchnew +srun --nodes=1 --ntasks-per-node=1 --gres=gpu:1 --cpus-per-task=30 -o ${MODEL_NAME}-%J.log --jobid=229555 bash -c 'python3 $SCRIPT_PATH $CMD' + diff --git a/fengshen/examples/summary/randeng_t5_70M_summary.sh b/fengshen/examples/summary/randeng_t5_70M_summary.sh new file mode 100644 index 0000000000000000000000000000000000000000..403d8d4dd022bf90fe9f50854291ec4e48f13aff --- /dev/null +++ b/fengshen/examples/summary/randeng_t5_70M_summary.sh @@ -0,0 +1,128 @@ +#!/bin/bash +#SBATCH --job-name=randeng_t5_77M_summary +#SBATCH --nodes=1 +#SBATCH --ntasks-per-node=2 +#SBATCH --gres=gpu:2 # number of gpus +#SBATCH --cpus-per-task=30 +#SBATCH -o %x-%j.log + +set -x -e + +echo "START TIME: $(date)" 
+MODEL_NAME=randeng_t5_77M_summary_test2 +MICRO_BATCH_SIZE=64 +ROOT_DIR=/cognitive_comp/dongxiaoqun/finetune/${MODEL_NAME} +if [ ! -d ${ROOT_DIR} ];then + mkdir ${ROOT_DIR} + echo ${ROOT_DIR} created!!!!!!!!!!!!!! +else + echo ${ROOT_DIR} exist!!!!!!!!!!!!!!! +fi + +output_save_path=$ROOT_DIR/${MODEL_NAME}.json +if [ -f ${output_save_path} ];then + echo ${output_save_path} exist, rm it!!!!!!!!!!!!!!!!! + rm ${output_save_path} +fi +ZERO_STAGE=1 + +config_json="${ROOT_DIR}/ds_config.${MODEL_NAME}.json" + +# Deepspeed figures out GAS dynamically from dynamic GBS via set_train_batch_size() +cat < $config_json +{ + "train_micro_batch_size_per_gpu": ${MICRO_BATCH_SIZE}, + "steps_per_print": 100, + "gradient_clipping": 1.0, + "zero_optimization": { + "stage": $ZERO_STAGE, + "contiguous_gradients": false, + "overlap_comm": true, + "reduce_scatter": true, + "reduce_bucket_size": 50000000, + "allgather_bucket_size": 500000000 + }, + "optimizer": { + "type": "Adam", + "params": { + "lr": 1e-4, + "weight_decay": 1e-2 + } + }, + "scheduler": { + "params": { + "warmup_max_lr": 1e-04, + "warmup_min_lr": 1e-05, + "total_num_steps": 60000, + "warmup_num_steps" : 500 + }, + "type": "WarmupDecayLR" + }, + "zero_allow_untested_optimizer": false, + "fp16": { + "enabled": true, + "loss_scale": 0, + "loss_scale_window": 1000, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "activation_checkpointing": { + "partition_activations": false, + "contiguous_memory_optimization": false + }, + "wall_clock_breakdown": false +} +EOT + +export PL_DEEPSPEED_CONFIG_PATH=$config_json +export TORCH_EXTENSIONS_DIR=/cognitive_comp/dongxiaoqun/torch_extendsions +# export MASTER_PORT=$[RANDOM%10000+30000] +# export PL_FAULT_TOLERANT_TRAINING=1 + +TRAINER_ARGS=" + --max_epochs 2 \ + --gpus 1 \ + --num_nodes 1 \ + --strategy deepspeed_stage_${ZERO_STAGE} \ + --default_root_dir $ROOT_DIR \ + --dirpath $ROOT_DIR/ckpt \ + --save_top_k 3 \ + --monitor val_loss \ + --mode min \ + --save_last \ + --every_n_train_steps 0 \ + --val_check_interval 0.1 \ +" + +prompt="summary:" +DATA_ARGS=" + --datasets_name lcsts \ + --num_workers 30 \ + --train_batchsize $MICRO_BATCH_SIZE \ + --val_batchsize $MICRO_BATCH_SIZE \ + --test_batchsize $MICRO_BATCH_SIZE \ + --max_enc_length 128 \ + --max_dec_length 64 \ + --val_datasets_field val \ + --prompt $prompt \ +" +# --prompt $prompt \ +MODEL_ARGS=" + --pretrained_model_path /cognitive_comp/ganruyi/experiments/randeng_t5_77M/ckpt/hf_pretrained_epoch0_step183100 \ + --output_save_path $ROOT_DIR/randeng_t5_77M_predict_lcsts.json \ +" + +SCRIPTS_PATH=/cognitive_comp/dongxiaoqun/debug/Fengshenbang-LM/fengshen/examples/summary/seq2seq_summary.py + +export CMD=" \ + $SCRIPTS_PATH \ + $TRAINER_ARGS \ + $MODEL_ARGS \ + $DATA_ARGS \ + " +echo $CMD +# python $CMD + +source activate +conda activate torchnew +srun --nodes=1 --ntasks-per-node=1 --gres=gpu:1 --cpus-per-task=30 -o ${MODEL_NAME}-%J.log --jobid=229623 bash -c 'python3 $SCRIPT_PATH $CMD' diff --git a/fengshen/examples/summary/randeng_t5_70M_summary_predict.sh b/fengshen/examples/summary/randeng_t5_70M_summary_predict.sh new file mode 100644 index 0000000000000000000000000000000000000000..ccbf410fa92b1d5e09c97d6ae3af7bb4ff121c64 --- /dev/null +++ b/fengshen/examples/summary/randeng_t5_70M_summary_predict.sh @@ -0,0 +1,138 @@ +#!/bin/bash +#SBATCH --job-name=randeng_t5_77M_summary_predict +#SBATCH --nodes=1 +#SBATCH --ntasks-per-node=2 +#SBATCH --gres=gpu:2 # number of gpus +#SBATCH --cpus-per-task=30 +#SBATCH -o %x-%j.log + +set -x -e + +echo "START TIME: 
$(date)" +MODEL_NAME=randeng_t5_77M_summary_predict +MICRO_BATCH_SIZE=16 +ROOT_DIR=/cognitive_comp/ganruyi/experiments/${MODEL_NAME} +if [ ! -d ${ROOT_DIR} ];then + mkdir ${ROOT_DIR} + echo ${ROOT_DIR} created!!!!!!!!!!!!!! +else + echo ${ROOT_DIR} exist!!!!!!!!!!!!!!! +fi + +output_save_path=$ROOT_DIR/randeng_t5_77M_predict_lcsts.json +if [ -f ${output_save_path} ];then + echo ${output_save_path} exist, rm it!!!!!!!!!!!!!!!!! + rm ${output_save_path} +fi + +ZERO_STAGE=1 + +config_json="${ROOT_DIR}/ds_config.${MODEL_NAME}.json" + +# Deepspeed figures out GAS dynamically from dynamic GBS via set_train_batch_size() +cat < $config_json +{ + "train_micro_batch_size_per_gpu": ${MICRO_BATCH_SIZE}, + "steps_per_print": 100, + "gradient_clipping": 1.0, + "zero_optimization": { + "stage": $ZERO_STAGE, + "contiguous_gradients": false, + "overlap_comm": true, + "reduce_scatter": true, + "reduce_bucket_size": 50000000, + "allgather_bucket_size": 500000000 + }, + "optimizer": { + "type": "Adam", + "params": { + "lr": 1e-4, + "betas": [ + 0.9, + 0.95 + ], + "eps": 1e-8, + "weight_decay": 5e-2 + } + }, + "scheduler": { + "type": "WarmupLR", + "params":{ + "warmup_min_lr": 5e-6, + "warmup_max_lr": 1e-4 + } + }, + "zero_allow_untested_optimizer": false, + "fp16": { + "enabled": true, + "loss_scale": 0, + "loss_scale_window": 1000, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "activation_checkpointing": { + "partition_activations": false, + "contiguous_memory_optimization": false + }, + "wall_clock_breakdown": false +} +EOT + +export PL_DEEPSPEED_CONFIG_PATH=$config_json +export TORCH_EXTENSIONS_DIR=/cognitive_comp/ganruyi/tmp/torch_extendsions +export MASTER_PORT=$[RANDOM%10000+50000] + +# --strategy deepspeed_stage_${ZERO_STAGE} \ +TRAINER_ARGS=" + --max_epochs 1 \ + --gpus 2 \ + --num_nodes 1 \ + --strategy ddp \ + --default_root_dir $ROOT_DIR \ + --dirpath $ROOT_DIR/ckpt \ + --save_top_k 3 \ + --monitor train_loss \ + --mode min \ + --save_last \ + --every_n_train_steps 0 \ +" +DATA_DIR=/cognitive_comp/ganruyi/data_datasets_LCSTS_LCSTS/ +prompt="summary:" +DATA_ARGS=" + --datasets_name lcsts \ + --num_workers 30 \ + --train_batchsize $MICRO_BATCH_SIZE \ + --val_batchsize $MICRO_BATCH_SIZE \ + --test_batchsize $MICRO_BATCH_SIZE \ + --max_enc_length 128 \ + --max_dec_length 64 \ + --val_datasets_field val \ + --prompt $prompt \ +" +# --prompt $prompt \ +# --pretrained_model_path /cognitive_comp/ganruyi/experiments/randeng_t5_77M_summary/ckpt/hf_pretrained_epoch1_step75019 \ + +MODEL_ARGS=" + --pretrained_model_path /cognitive_comp/gaoxinyu/pretrained_model/bart-759M \ + --output_save_path $ROOT_DIR/randeng_t5_77M_predict_lcsts.json \ + --learning_rate 1e-4 \ + --weight_decay 0.1 \ + --precision 16 \ + --warmup 0.01 \ + --do_eval_only \ + --max_dec_length 32 \ +" + +SCRIPTS_PATH=/cognitive_comp/ganruyi/Fengshenbang-LM/fengshen/examples/summary/seq2seq_summary.py +SINGULARITY_PATH=/cognitive_comp/ganruyi/pytorch21_06_py3_docker_image_v2.sif + +export CMD=" \ + $SCRIPTS_PATH \ + $TRAINER_ARGS \ + $MODEL_ARGS \ + $DATA_ARGS \ + " +echo $CMD +source activate base +# srun singularity exec --nv -B /cognitive_comp/:/cognitive_comp/ $SINGULARITY_PATH bash -c '/home/ganruyi/anaconda3/bin/python $CMD' +python $CMD \ No newline at end of file diff --git a/fengshen/examples/summary/randeng_t5_784M_summary.sh b/fengshen/examples/summary/randeng_t5_784M_summary.sh new file mode 100644 index 0000000000000000000000000000000000000000..5b3e60c8784ac563eff09763591e00b6d250444f --- /dev/null +++ 
b/fengshen/examples/summary/randeng_t5_784M_summary.sh @@ -0,0 +1,130 @@ +#!/bin/bash +#SBATCH --job-name=randeng_t5_77M_summary +#SBATCH --nodes=1 +#SBATCH --ntasks-per-node=2 +#SBATCH --gres=gpu:2 # number of gpus +#SBATCH --cpus-per-task=30 +#SBATCH -o %x-%j.log + +set -x -e + +echo "START TIME: $(date)" +MODEL_NAME=randeng_t5_784M_summary +MICRO_BATCH_SIZE=8 +ROOT_DIR=/cognitive_comp/dongxiaoqun/finetune/${MODEL_NAME} +if [ ! -d ${ROOT_DIR} ];then + mkdir ${ROOT_DIR} + echo ${ROOT_DIR} created!!!!!!!!!!!!!! +else + echo ${ROOT_DIR} exist!!!!!!!!!!!!!!! +fi + +ZERO_STAGE=1 + +config_json="${ROOT_DIR}/ds_config.${MODEL_NAME}.json" + +# Deepspeed figures out GAS dynamically from dynamic GBS via set_train_batch_size() +cat < $config_json +{ + "train_micro_batch_size_per_gpu": ${MICRO_BATCH_SIZE}, + "steps_per_print": 100, + "gradient_clipping": 1.0, + "zero_optimization": { + "stage": $ZERO_STAGE, + "contiguous_gradients": false, + "overlap_comm": true, + "reduce_scatter": true, + "reduce_bucket_size": 50000000, + "allgather_bucket_size": 500000000 + }, + "optimizer": { + "type": "Adam", + "params": { + "lr": 1e-4, + "weight_decay": 1e-2 + } + }, + "scheduler": { + "params": { + "warmup_max_lr": 1e-04, + "warmup_min_lr": 1e-05, + "total_num_steps": 60000, + "warmup_num_steps" : 500 + }, + "type": "WarmupDecayLR" + }, + "zero_allow_untested_optimizer": false, + "fp16": { + "enabled": true, + "loss_scale": 0, + "loss_scale_window": 1000, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "activation_checkpointing": { + "partition_activations": false, + "contiguous_memory_optimization": false + }, + "wall_clock_breakdown": false +} +EOT + +export PL_DEEPSPEED_CONFIG_PATH=$config_json +export TORCH_EXTENSIONS_DIR=/cognitive_comp/dongxiaoqun/torch_extendsions +# export MASTER_PORT=$[RANDOM%10000+30000] +# export PL_FAULT_TOLERANT_TRAINING=1 + +TRAINER_ARGS=" + --max_epochs 1 \ + --gpus 1 \ + --num_nodes 1 \ + --strategy deepspeed_stage_${ZERO_STAGE} \ + --default_root_dir $ROOT_DIR \ + --dirpath $ROOT_DIR/ckpt \ + --save_top_k 3 \ + --monitor val_loss \ + --mode min \ + --save_last \ + --every_n_train_steps 0 \ + --val_check_interval 0.1 \ +" + +prompt="summary:" +DATA_ARGS=" + --datasets_name lcsts \ + --num_workers 30 \ + --train_batchsize $MICRO_BATCH_SIZE \ + --val_batchsize $MICRO_BATCH_SIZE \ + --test_batchsize $MICRO_BATCH_SIZE \ + --max_enc_length 128 \ + --max_dec_length 64 \ + --val_datasets_field val \ + --prompt $prompt \ +" +# --prompt $prompt \ +MODEL_ARGS=" + --pretrained_model_path /cognitive_comp/ganruyi/experiments/randeng_t5_large_v2/ckpt/hf_pretrained_epoch0_step732500 \ + --output_save_path $ROOT_DIR/randeng_t5_784M_predict_lcsts.json \ +" + +SCRIPTS_PATH=/cognitive_comp/dongxiaoqun/debug/Fengshenbang-LM/fengshen/examples/summary/seq2seq_summary.py +SINGULARITY_PATH=/cognitive_comp/ganruyi/pytorch21_06_py3_docker_image_v2.sif + +export CMD=" \ + $SCRIPTS_PATH \ + $TRAINER_ARGS \ + $MODEL_ARGS \ + $DATA_ARGS \ + " +echo $CMD + +source activate +conda activate torchnew +srun --nodes=1 --ntasks-per-node=1 --gres=gpu:1 --cpus-per-task=30 -o ${MODEL_NAME}-%J.log --jobid=229668 bash -c 'python3 $SCRIPT_PATH $CMD' +# source activate base +# python $CMD + +# srun --jobid=229668 --nodes=1 --gres=gpu:1 --ntasks-per-node=1 --cpus-per-task=30 -e ${ROOT_DIR}/${MODEL_NAME}-%j.err -o ${ROOT_DIR}/${MODEL_NAME}-%j.log singularity exec --nv -B /cognitive_comp/:/cognitive_comp/ $SINGULARITY_PATH bash -c '/home/ganruyi/anaconda3/bin/python $CMD' + +# srun python $CMD +# srun singularity exec --nv 
-B /cognitive_comp/:/cognitive_comp/ $SINGULARITY_PATH bash -c '/home/ganruyi/anaconda3/bin/python $CMD' diff --git a/fengshen/examples/summary/seq2seq_summary.py b/fengshen/examples/summary/seq2seq_summary.py new file mode 100644 index 0000000000000000000000000000000000000000..c0c725c215d61dc5c6fa0fbf6603b7f06f0a317b --- /dev/null +++ b/fengshen/examples/summary/seq2seq_summary.py @@ -0,0 +1,197 @@ + +import torch +import os +import argparse +import json +import pytorch_lightning as pl +from fengshen.models.model_utils import add_module_args +from fengshen.data.task_dataloader.task_datasets import AbstractCollator +from fengshen.data.universal_datamodule import UniversalDataModule +from fengshen.utils.universal_checkpoint import UniversalCheckpoint +from fengshen.utils.utils import chinese_char_tokenize +from torchmetrics.text.rouge import ROUGEScore +from pytorch_lightning import Trainer, loggers +from pytorch_lightning.callbacks import LearningRateMonitor +from transformers import AutoTokenizer, AutoModelForSeq2SeqLM +import sys +sys.path.append('../../../') + + +# os.environ["CUDA_VISIBLE_DEVICES"] = '3,4' + + +class FinetuneSummary(pl.LightningModule): + @staticmethod + def add_model_specific_args(parent_args): + parser = parent_args.add_argument_group('BaseModel') + parser.add_argument('--rouge_keys', default='rougeL,rouge1,rouge2', type=str) + return parent_args + + def __init__(self, args, tokenizer=None): + super().__init__() + self.save_hyperparameters(args) + self.model = AutoModelForSeq2SeqLM.from_pretrained( + args.pretrained_model_path) + self.tokenizer = tokenizer + assert self.tokenizer, "tokenizer is None!" + self.rouge_keys = tuple(args.rouge_keys.split(',')) + self.rouge_metric = ROUGEScore(rouge_keys=self.rouge_keys, normalizer=lambda x: x) + + def setup(self, stage) -> None: + if stage == 'fit': + train_loader = self.trainer._data_connector._train_dataloader_source.dataloader() + + # Calculate total steps + tb_size = self.hparams.train_batchsize * max(1, self.trainer.gpus) + ab_size = self.trainer.accumulate_grad_batches * \ + float(self.trainer.max_epochs) + self.total_steps = ( + len(train_loader.dataset) // tb_size) // ab_size + print('total_steps is :', self.total_steps) + + def training_step(self, batch, batch_idx): + output = self.model(input_ids=batch['input_ids'], + attention_mask=batch['attention_mask'], labels=batch['labels']) + self.log('train_loss', output.loss, sync_dist=True) + return output.loss + + def on_validation_start(self) -> None: + # rm file at validation start + prefix, ext = os.path.splitext(self.hparams.output_save_path) + file_path_rank = '{}_{}{}'.format( + prefix, self.trainer._accelerator_connector.cluster_environment.global_rank(), ext) + if os.path.exists(file_path_rank): + print('rm {}'.format(file_path_rank)) + os.remove(file_path_rank) + + def validation_step(self, batch, batch_idx): + output = self.model(input_ids=batch['input_ids'], + attention_mask=batch['attention_mask'], labels=batch['labels']) + generated_ids = self.model.generate( + input_ids=batch['input_ids'], + attention_mask=batch['attention_mask'], + max_length=self.hparams.max_dec_length + ) + + preds = self.tokenizer.batch_decode(generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True) + labels = torch.where(batch['labels'] != -100, batch['labels'], + self.tokenizer.pad_token_id) + labels = self.tokenizer.batch_decode( + labels, skip_special_tokens=True, clean_up_tokenization_spaces=True) + # save preds for every rank + prefix, ext = 
os.path.splitext(self.hparams.output_save_path) + file_path_rank = '{}_{}{}'.format( + prefix, self.trainer._accelerator_connector.cluster_environment.global_rank(), ext) + self.save_prediction_to_file(preds=preds, texts=batch['text'], + summarys=batch['summary'], file_path=file_path_rank) + # you need to split chinese char with space for rouge metric + new_preds = [chinese_char_tokenize(p) for p in preds] + new_labels = [chinese_char_tokenize(label) for label in labels] + # update metric + self.rouge_metric.update(preds=new_preds, target=new_labels) + self.log('val_loss', output.loss, sync_dist=True) + + def validation_epoch_end(self, outputs): + # compute metric for all process + rouge_dict = self.rouge_metric.compute() + # reset the metric after once validation + self.rouge_metric.reset() + for k, v in rouge_dict.items(): + self.log('val_{}'.format(k), v, sync_dist=True) + if self.trainer._accelerator_connector.cluster_environment.global_rank() == 0: + print('rouge:\n', rouge_dict) + + def on_save_checkpoint(self, checkpoint) -> None: + if self.trainer._accelerator_connector.cluster_environment.global_rank() == 0: + self.model.save_pretrained(os.path.join( + self.trainer.checkpoint_callback.dirpath, + 'hf_pretrained_epoch{}_step{}'.format(checkpoint['epoch'], checkpoint['global_step']))) + + def save_prediction_to_file(self, preds, texts, summarys, file_path): + with open(file_path, 'a', encoding='utf-8') as f: + for idx, pred in enumerate(preds): + text = texts[idx] + summary = summarys[idx] + tmp_result = dict() + tmp_result['pred'] = pred + tmp_result['label'] = summary + tmp_result['text'] = text + json_data = json.dumps(tmp_result, ensure_ascii=False) + f.write(json_data + '\n') + + def predict_step(self, batch, batch_idx): + # print(batch) + texts = batch['text'] + # output summary and metrics + generated_ids = self.model.generate( + input_ids=batch['input_ids'], + attention_mask=batch['attention_mask'], + max_length=self.hparams.max_dec_length + ) + preds = self.tokenizer.batch_decode(generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True) + labels = self.tokenizer.batch_decode( + batch['labels'], skip_special_tokens=True, clean_up_tokenization_spaces=True) + print(batch_idx, len(preds), len(labels)) + self.save_prediction_to_file(preds, texts, labels) + + def configure_optimizers(self): + from fengshen.models.model_utils import configure_optimizers + return configure_optimizers(self) + + +def main(): + total_parser = argparse.ArgumentParser("Summary Task") + total_parser.add_argument('--do_eval_only', + action='store_true', + default=False) + total_parser.add_argument('--pretrained_model_path', + default='google/mt5-small', + type=str) + total_parser.add_argument('--output_save_path', + default='./predict.json', + type=str) + total_parser.add_argument('--self_tokenizer', + action='store_true', + default=False) + total_parser.add_argument('--max_enc_length', default=1024, type=int) + total_parser.add_argument('--max_dec_length', default=256, type=int) + total_parser.add_argument('--prompt', default='summarize:', type=str) + # * Args for data preprocessing + # from fengshen.data.task_dataloader.task_datasets import LCSTSDataModel + total_parser = UniversalDataModule.add_data_specific_args(total_parser) + # * Args for training + total_parser = add_module_args(total_parser) + total_parser = Trainer.add_argparse_args(total_parser) + total_parser = UniversalCheckpoint.add_argparse_args(total_parser) + total_parser = 
FinetuneSummary.add_model_specific_args(total_parser) + # * Args for base model + args = total_parser.parse_args() + + if args.self_tokenizer: + from fengshen.examples.pegasus.tokenizers_pegasus import PegasusTokenizer + tokenizer = PegasusTokenizer.from_pretrained(args.pretrained_model_path) + else: + tokenizer = AutoTokenizer.from_pretrained(args.pretrained_model_path, use_fast=False) + collator = AbstractCollator(tokenizer, args.max_enc_length, + args.max_dec_length, args.prompt) + data_model = UniversalDataModule(tokenizer=tokenizer, args=args, collate_fn=collator) + model = FinetuneSummary(args, tokenizer) + if not args.do_eval_only: + lr_monitor = LearningRateMonitor(logging_interval='step') + logger = loggers.TensorBoardLogger(save_dir=os.path.join( + args.default_root_dir, 'log/')) + checkpoint_callback = UniversalCheckpoint(args) + trainer = Trainer.from_argparse_args(args, + logger=logger, + callbacks=[lr_monitor, + checkpoint_callback] + ) + trainer.fit(model, data_model) + else: + trainer = Trainer.from_argparse_args(args) + # trainer.predict(model, data_model) + trainer.validate(model, data_model) + + +if __name__ == '__main__': + main() diff --git a/fengshen/examples/tcbert/README.md b/fengshen/examples/tcbert/README.md new file mode 100644 index 0000000000000000000000000000000000000000..a6f6b38e2b9cc6978962927bb0e8568b46da28f0 --- /dev/null +++ b/fengshen/examples/tcbert/README.md @@ -0,0 +1,145 @@ +[**中文**](./README.md) + +# TCBert +论文 《[TCBERT: A Technical Report for Chinese Topic Classification BERT](https://arxiv.org/abs/2211.11304)》源码 + +## Requirements + +安装 fengshen 框架 + +```shell +git clone https://github.com/IDEA-CCNL/Fengshenbang-LM.git +cd Fengshenbang-LM +pip install --editable . +``` + +## Quick Start + +你可以参考我们的 [example.py](./example.py) 脚本,只需要将处理好的 ```train_data```、```dev_data```、```test_data```、 ```prompt```、```prompt_label``` ,输入模型即可。 +```python +import argparse +from fengshen.pipelines.tcbert import TCBertPipelines +from pytorch_lightning import seed_everything + +total_parser = argparse.ArgumentParser("Topic Classification") +total_parser = TCBertPipelines.piplines_args(total_parser) +args = total_parser.parse_args() + +pretrained_model_path = 'IDEA-CCNL/Erlangshen-TCBert-110M-Classification-Chinese' +args.learning_rate = 2e-5 +args.max_length = 512 +args.max_epochs = 3 +args.batchsize = 1 +args.train = 'train' +args.default_root_dir = './' +# args.gpus = 1 #注意:目前使用CPU进行训练,取消注释会使用GPU,但需要配置相应GPU环境版本 +args.fixed_lablen = 2 #注意:可以设置固定标签长度,由于样本对应的标签长度可能不一致,建议选择合适的数值表示标签长度 + +train_data = [ + {"content": "凌云研发的国产两轮电动车怎么样,有什么惊喜?", "label": "科技",} + ] + +dev_data = [ + {"content": "我四千一个月,老婆一千五一个月,存款八万且有两小孩,是先买房还是先买车?","label": "汽车",} +] + +test_data = [ + {"content": "街头偶遇2018款长安CS35,颜值美炸!或售6万起,还买宝骏510?"} +] + +prompt = "下面是一则关于{}的新闻:" + +prompt_label = {"汽车":"汽车", "科技":"科技"} + +model = TCBertPipelines(args, model_path=pretrained_model_path, nlabels=len(prompt_label)) + +if args.train: + model.train(train_data, dev_data, prompt, prompt_label) +result = model.predict(test_data, prompt, prompt_label) +``` + + +## Pretrained Model +为了提高模型在话题分类上的效果,我们收集了大量话题分类数据进行基于`prompt`的预训练。我们已经将预训练模型开源到 ```HuggingFace``` 社区当中。 + +| 模型 | 地址 | +|:---------:|:--------------:| +| Erlangshen-TCBert-110M-Classification-Chinese | [https://huggingface.co./IDEA-CCNL/Erlangshen-TCBert-110M-Classification-Chinese](https://huggingface.co./IDEA-CCNL/Erlangshen-TCBert-110M-Classification-Chinese) | +| Erlangshen-TCBert-330M-Classification-Chinese | 
[https://huggingface.co./IDEA-CCNL/Erlangshen-TCBert-330M-Classification-Chinese](https://huggingface.co./IDEA-CCNL/Erlangshen-TCBert-330M-Classification-Chinese) | +| Erlangshen-TCBert-1.3B-Classification-Chinese | [https://huggingface.co./IDEA-CCNL/Erlangshen-TCBert-1.3B-Classification-Chinese](https://huggingface.co./IDEA-CCNL/Erlangshen-TCBert-1.3B-Classification-Chinese) | +| Erlangshen-TCBert-110M-Sentence-Embedding-Chinese | [https://huggingface.co./IDEA-CCNL/Erlangshen-TCBert-110M-Sentence-Embedding-Chinese](https://huggingface.co./IDEA-CCNL/Erlangshen-TCBert-110M-Sentence-Embedding-Chinese) | +| Erlangshen-TCBert-330M-Sentence-Embedding-Chinese | [https://huggingface.co./IDEA-CCNL/Erlangshen-TCBert-330M-Sentence-Embedding-Chinese](https://huggingface.co./IDEA-CCNL/Erlangshen-TCBert-330M-Sentence-Embedding-Chinese) | +| Erlangshen-TCBert-1.3B-Sentence-Embedding-Chinese | [https://huggingface.co./IDEA-CCNL/Erlangshen-TCBert-1.3B-Sentence-Embedding-Chinese](https://huggingface.co./IDEA-CCNL/Erlangshen-TCBert-1.3B-Sentence-Embedding-Chinese) | + +## Experiments + +对每个不同的数据集,选择合适的模板```Prompt``` +Dataset | Prompt +|------------|------------| +| TNEWS | 下面是一则关于{}的新闻: | +| CSLDCP | 这一句描述{}的内容如下: | +| IFLYTEK | 这一句描述{}的内容如下: | + +使用上述```Prompt```的实验结果如下: +| Model | TNEWS | CLSDCP | IFLYTEK | +|------------|------------|----------|-----------| +| Macbert-base | 55.02 | 57.37 | 51.34 | +| Macbert-large | 55.77 | 58.99 | 50.31 | +| Erlangshen-1.3B | 57.36 | 62.35 | 53.23 | +| TCBert-base-110M-Classification-Chinese | 55.57 | 58.60 | 49.63 | +| TCBert-large-330M-Classification-Chinese | 56.17 | 61.23 | 51.34 | +| TCBert-1.3B-Classification-Chinese | 57.41 | 65.10 | 53.75 | +| TCBert-base-110M-Sentence-Embedding-Chinese | 54.68 | 59.78 | 49.40 | +| TCBert-large-330M-Sentence-Embedding-Chinese | 55.32 | 62.07 | 51.11 | +| TCBert-1.3B-Sentence-Embedding-Chinese | 57.46 | 65.04 | 53.06 | + +## Dataset + +需要您提供:```训练集```、```验证集```、```测试集```、```Prompt```、```标签映射```五个数据,对应的数据格式如下: + +#### 训练数据 示例 +必须包含```content```和```label```字段 +```json +[{ + "content": "街头偶遇2018款长安CS35,颜值美炸!或售6万起,还买宝骏510?", + "label": "汽车" +}] +``` + +#### 验证数据 示例 +必须包含```content```和```label```字段 +```json +[{ + "content": "宁夏邀深圳市民共赴“寻找穿越”之旅", + "label": "旅游" +}] +``` + +#### 测试数据 示例 +必须包含```content```字段 +```json +[{ + "content": "买涡轮增压还是自然吸气车?今天终于有答案了!" 
+}] +``` +#### Prompt 示例 +可以选择任一模版,模版的选择会对模型效果产生影响,其中必须包含```{}```,作为标签占位符 +```json +"下面是一则关于{}的新闻:" +``` + +#### 标签映射 示例 +可以将真实标签映射为更合适Prompt的标签,支持映射后的标签长度不一致 +```json +{ + "汽车": "汽车", + "旅游": "旅游", + "经济生活": "经济生活", + "房产新闻": "房产" +} +``` + +## License + +[Apache License 2.0](https://github.com/IDEA-CCNL/Fengshenbang-LM/blob/main/LICENSE) + diff --git a/fengshen/examples/tcbert/__init__.py b/fengshen/examples/tcbert/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/fengshen/examples/tcbert/example.py b/fengshen/examples/tcbert/example.py new file mode 100644 index 0000000000000000000000000000000000000000..5eff218461c65f40ec88e9ea2c7e0cdbe1d05082 --- /dev/null +++ b/fengshen/examples/tcbert/example.py @@ -0,0 +1,86 @@ +import argparse +from fengshen.pipelines.tcbert import TCBertPipelines +from pytorch_lightning import seed_everything + +def main(): + seed_everything(123) + total_parser = argparse.ArgumentParser("Topic Classification") + total_parser = TCBertPipelines.piplines_args(total_parser) + args = total_parser.parse_args() + + pretrained_model_path = 'IDEA-CCNL/Erlangshen-TCBert-110M-Classification-Chinese' + args.learning_rate = 2e-5 + args.max_length = 512 + args.max_epochs = 5 + args.batchsize = 4 + args.train = 'train' + args.default_root_dir = './' + # args.gpus = 1 #注意:目前使用CPU进行训练,取消注释会使用GPU,但需要配置相应GPU环境版本 + args.fixed_lablen = 2 #注意:可以设置固定标签长度,由于样本对应的标签长度可能不一致,建议选择适中的数值表示标签长度 + + train_data = [ # 训练数据 + {"content": "真正的放养教育,放的是孩子的思维,养的是孩子的习惯", "label": "故事"}, + {"content": "《唐人街探案》捧红了王宝强跟刘昊然,唯独戏份不少的他发展最差", "label": "娱乐"}, + {"content": "油价攀升 阿曼经济加速增长", "label": "财经"}, + {"content": "日本男篮近期动作频频,中国队的未来劲敌会是他们吗?", "label": "体育"}, + {"content": "教育部:坚决防止因撤并乡村小规模学校导致学生上学困难", "label": "教育"}, + {"content": "LOL设计最完美的三个英雄,玩家们都很认可!", "label": "电竞"}, + {"content": "上联:浅看红楼终是梦,怎么对下联?", "label": "文化"}, + {"content": "楼市再出新政!北京部分限房价项目或转为共有产权房", "label": "房产"}, + {"content": "企业怎样选云服务器?云服务器哪家比较好?", "label": "科技"}, + {"content": "贝纳利的三缸车TRE899K、TRE1130K华丽转身", "label": "汽车"}, + {"content": "如何评价:刘姝威的《严惩做空中国股市者》?", "label": "股票"}, + {"content": "宁夏邀深圳市民共赴“寻找穿越”之旅", "label": "旅游"}, + {"content": "日本自民党又一派系力挺安倍 称会竭尽全力", "label": "国际"}, + {"content": "农村养老保险每年交5000,交满15年退休后能每月领多少钱?", "label": "农业"}, + {"content": "国产舰载机首次现身,进度超过预期,将率先在滑跃航母测试", "label": "军事"} + ] + + dev_data = [ # 验证数据 + {"content": "西游记后传中,灵儿最爱的女人是谁?不是碧游!", "label": "故事"}, + {"content": "小李子莱奥纳多有特别的提袋子技能,这些年他还有过哪些神奇的造型?", "label": "娱乐"}, + {"content": "现在手上有钱是投资买房还是存钱,为什么?", "label": "财经"}, + {"content": "迪卡侬的衣服值得购买吗?", "label": "体育"}, + {"content": "黑龙江省旅游委在齐齐哈尔组织举办导游培训班", "label": "教育"}, + {"content": "《王者荣耀》中,哪些英雄的大招最“废柴”?", "label": "电竞"}, + {"content": "上交演绎马勒《复活》,用音乐带来抚慰和希望", "label": "文化"}, + {"content": "All in服务业,58集团在租房、住房市场的全力以赋", "label": "房产"}, + {"content": "为什么有的人宁愿选择骁龙660的X21,也不买骁龙845的小米MIX2S?", "label": "科技"}, + {"content": "众泰大型SUV来袭,售13.98万,2.0T榨出231马力,汉兰达要危险了", "label": "汽车"}, + {"content": "股票放量下趺,大资金出逃谁在接盘?", "label": "股票"}, + {"content": "广西博白最大的特色是什么?", "label": "旅游"}, + {"content": "特朗普退出《伊朗核协议》,对此你怎么看?", "label": "国际"}, + {"content": "卖水果利润怎么样?", "label": "农业"}, + {"content": "特种兵都是身材高大的猛男么?别再被电视骗了,超过1米8都不合格", "label": "军事"} + ] + + test_data = [ # 测试数据 + {"content": "廖凡重出“江湖”再争影帝 亮相戛纳红毯霸气有型"}, + {"content": "《绝地求生: 刺激战场》越玩越卡?竟是手机厂商没交“保护费”!"}, + {"content": "买涡轮增压还是自然吸气车?今天终于有答案了!"}, + ] + + #标签映射 将真实标签可以映射为更合适prompt的标签 + prompt_label = { + "体育":"体育", "军事":"军事", "农业":"农业", "国际":"国际", + "娱乐":"娱乐", "房产":"房产", "故事":"故事", 
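+        # (说明性注释)键为数据中的真实标签,值为填入 prompt 的标签文本;此处保持一一对应,也可按需映射为更贴合 prompt 的表述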
"教育":"教育", + "文化":"文化", "旅游":"旅游", "汽车":"汽车", "电竞":"电竞", + "科技":"科技", "股票":"股票", "财经":"财经" + } + + #不同的prompt会影响模型效果 + #prompt = "这一句描述{}的内容如下:" + prompt = "下面是一则关于{}的新闻:" + + model = TCBertPipelines(args, model_path=pretrained_model_path, nlabels=len(prompt_label)) + + if args.train: + model.train(train_data, dev_data, prompt, prompt_label) + result = model.predict(test_data, prompt, prompt_label) + + for i, line in enumerate(result): + print({"content":test_data[i]["content"], "label":list(prompt_label.keys())[line]}) + + +if __name__ == "__main__": + main() diff --git a/fengshen/examples/translate/README.md b/fengshen/examples/translate/README.md new file mode 100644 index 0000000000000000000000000000000000000000..fec3d5c0142aa08375927aba65bbd7c60187ff06 --- /dev/null +++ b/fengshen/examples/translate/README.md @@ -0,0 +1,65 @@ +# translation examples +## 数据预处理 + +数据预处理部分目前整合不多,主要提供一个最终的转换文件,转换成模型应用格式,前期还是应用mose等工具进行数据的预处理部分,产出处理后的源目标语言和目标语言两份数据,再调用本脚本合并 + +前期数据预处理脚本可参考,deltalm在fairseq中的demo,prepare_iwslt14.sh:https://github.com/microsoft/unilm/blob/master/deltalm/examples/prepare_iwslt14.sh) + +### 目标格式 +需要将翻译的源语言和目标语言转换到一个文件中,格式如下: +src为源语言,tgt为目标语言,每一行都是一个json格式 +``` +{"src": "und was menschliche gesundheit ist , kann auch ziemlich kompliziert sein .", "tgt": "and it can be a very complicated thing , what human health is ."} +{"src": "nun , warum spielt das eine rolle für die menschliche gesundheit ?", "tgt": "now why does that matter for human health ?"} +{"src": "das ist ein bild der cannery row von 1932 .", "tgt": "this is a shot of cannery row in 1932 ."} +``` +### 处理脚本 + +目前的finetue数据主要是通过deltalm的提供的实现,通过脚本转换成封神数据格式 + +当前的转换脚本只是简单的将源语言和目标语言合并到一个文件,并生成上述格式,后续会继续完善处理脚本 + +脚本路径:Fengshenbang-LM/fengshen/examples/translate/prepare_dataset.py + + +使用方式: +``` +python prepare_dataset.py processed_data_path de-en +``` + +## deltalm 模型 + +### deltalm模型路径 +1) https://huggingface.co./IDEA-CCNL/Randeng-Deltalm-362M-En-Zn
+2) https://huggingface.co./IDEA-CCNL/Randeng-Deltalm-362M-Zh-En + +主要包含三个文件: +config.json:模型配置文件 +pytorch_model.bin:模型文件 +spm.model:sentence_piece文件 + +### deltalm 模型结构 +均实现在 Fengshenbang-LM/fengshen/models/deltalm 路径下,文件结构如下: +1) modeling_deltalm.py 实现模型的基本结构,结构如论文所示 +2) tokenizer_deltalm.py 实现模型的tokenzier部分 +3) configuration_deltalm.py 实现模型的config配置部分 + +### finetune 德译英示例 +主要实现代码在 Fengshenbang-LM/fengshen/examples/translate/finetune_deltalm.py +通过脚本调用即可, 参考脚本 Fengshenbang-LM/fengshen/examples/translate/finetune_deltalm.sh + +使用示例: +``` +bash -x finetune_deltalm.sh +``` + +注:如果要使用label_smoothing,当前需要设置label_smoothing参数不为0,当前默认值为0.1。直接在finetune_deltalm.sh里修改参数值即可 + +## 运行环境 + +pyhton = 3.8.10 +pytorch = 1.10.0 +transformers = 4.20.1 +pytorch-lightning = 1.6.5 + +相关环境安装可参考Wiki:http://wiki.team.idea.edu.cn/pages/viewpage.action?pageId=16291924 diff --git a/fengshen/examples/translate/finetune_deltalm.py b/fengshen/examples/translate/finetune_deltalm.py new file mode 100644 index 0000000000000000000000000000000000000000..d19dd1ca4a5f920dcb90863e89940f05362e2cda --- /dev/null +++ b/fengshen/examples/translate/finetune_deltalm.py @@ -0,0 +1,449 @@ +# !/usr/bin/env python +# -*- coding: utf-8 -*- +import pandas as pd +import json +import argparse +import torch +import os +import logging +from transformers import AutoTokenizer, AutoModelForSeq2SeqLM +from pytorch_lightning.utilities import rank_zero_info +from sacrebleu.metrics import BLEU +from fengshen.utils.utils import chinese_char_tokenize +from fengshen.models.model_utils import add_module_args, add_inverse_square_args +from fengshen.models.deltalm.tokenizer_deltalm import DeltalmTokenizer +from fengshen.models.deltalm.modeling_deltalm import DeltalmForConditionalGeneration +from fengshen.utils import UniversalCheckpoint +from fengshen.data.universal_datamodule import UniversalDataModule +from pytorch_lightning import Trainer, loggers, LightningModule +from pytorch_lightning.callbacks import LearningRateMonitor +from mosestokenizer import MosesDetokenizer +from typing import List +import sys +sys.path.append('../../../') + +# from transformers import MBartForConditionalGeneration, MBart50TokenizerFast +# from pytorch_lightning.callbacks.early_stopping import EarlyStopping + + +mose_decode = MosesDetokenizer() + +os.environ["CUDA_VISIBLE_DEVICES"] = '4' +logger = logging.getLogger(__name__) + +EVAL_BLEU_ORDER = 4 + + +def calc_bleu_from_stats(sentence_stats: pd.DataFrame) -> BLEU: + corpus_stats = sentence_stats.sum(axis=0) + smooth = {"smooth_method": "exp"} + corpus_bleu = BLEU.compute_bleu( + correct=[ + corpus_stats.correct_1_grams, + corpus_stats.correct_2_grams, + corpus_stats.correct_3_grams, + corpus_stats.correct_4_grams, + ], + total=[ + corpus_stats.total_1_grams, + corpus_stats.total_2_grams, + corpus_stats.total_3_grams, + corpus_stats.total_4_grams, + ], + sys_len=corpus_stats.translation_length, + ref_len=corpus_stats.reference_length, + **smooth + ) + return corpus_bleu + + +def label_smoothed_nll_loss(lprobs, target, epsilon, ignore_index=None, reduce=True): + if target.dim() == lprobs.dim() - 1: + target = target.unsqueeze(-1) + # logger.debug("Debug: After target.dim() == lprobs.dim(): ", target.dim(), lprobs.dim()) + nll_loss = -lprobs.gather(dim=-1, index=target) + smooth_loss = -lprobs.sum(dim=-1, keepdim=True) + if ignore_index is not None: + pad_mask = target.eq(ignore_index) + nll_loss.masked_fill_(pad_mask, 0.0) + smooth_loss.masked_fill_(pad_mask, 0.0) + else: + nll_loss = nll_loss.squeeze(-1) + smooth_loss = 
smooth_loss.squeeze(-1) + if reduce: + nll_loss = nll_loss.sum() + smooth_loss = smooth_loss.sum() + eps_i = epsilon / (lprobs.size(-1) - 1) + valid_length = target.ne(ignore_index).sum() + # unvalid_length = target.eq(ignore_index).sum() + loss = ((1.0 - epsilon - eps_i) * nll_loss + eps_i * smooth_loss) / valid_length.item() + + return loss, nll_loss + + +class DataCollator: + def __init__(self, model, tokenizer, max_enc_length, max_dec_length, reverse_src_tgt): + self.tokenizer = tokenizer + self.max_enc_length = max_enc_length + self.max_dec_length = max_dec_length + self.model = model + self.reverse_src_tgt = reverse_src_tgt + + def __call__(self, batch_samples): + batch_inputs, batch_targets = [], [] + for sample in batch_samples: + if self.reverse_src_tgt: + if "tgt" in sample and len(sample["tgt"]) != 0: + batch_inputs.append(sample["tgt"]) + batch_targets.append(sample["src"]) + else: + if "src" in sample and len(sample["src"]) != 0: + batch_inputs.append(sample["src"]) + batch_targets.append(sample["tgt"]) + batch_data = self.tokenizer( + batch_inputs, + padding='max_length', + max_length=self.max_enc_length, + truncation=True, + return_tensors="pt" + ) + with self.tokenizer.as_target_tokenizer(): + labels = self.tokenizer( + batch_targets, + padding='max_length', + max_length=self.max_dec_length, + truncation=False, + return_tensors="pt" + )["input_ids"] + batch_data['decoder_input_ids'] = self.model.prepare_decoder_input_ids_from_labels(labels) + batch_data['labels'] = labels + + batch_data['src'] = batch_inputs + batch_data['tgt'] = batch_targets + + # logger.debug(batch_data) + return batch_data + + +class FinetuneTranslation(LightningModule): + + @staticmethod + def add_model_specific_args(parent_args): + parser = parent_args.add_argument_group('deltalm-base finetune') + parser.add_argument('--label_smoothing', default=0.1, type=float) + return parent_args + + def __init__(self, args, tokenizer=None): + super().__init__() + self.args = args + self.save_hyperparameters(args) + if args.other_model: + self.model = AutoModelForSeq2SeqLM.from_pretrained(args.model_path) + else: + self.model = DeltalmForConditionalGeneration.from_pretrained(args.model_path, ignore_mismatched_sizes=True) + self.tokenizer = tokenizer + assert self.tokenizer, "tokenizer is None!" 
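+        # BLEU metric from sacrebleu: per-batch n-gram sufficient statistics are collected
+        # in validation_step via get_sufficient_stats() and aggregated into a corpus-level
+        # score by calc_bleu_from_stats() in validation_epoch_end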
+ self.blue_metric = BLEU() + self.sufficient_stats: List[List[int]] = [] + self.label_smoothing = self.args.label_smoothing + self.mose_decode = MosesDetokenizer() + + if self.args.label_smoothing != 0: + self.loss_fn = label_smoothed_nll_loss + + def setup(self, stage) -> None: + if stage == 'fit': + train_loader = self.trainer._data_connector._train_dataloader_source.dataloader() + + # Calculate total steps + tb_size = self.hparams.train_batchsize * max(1, self.trainer.gpus) + ab_size = self.trainer.accumulate_grad_batches * float( + self.trainer.max_epochs) + self.total_steps = (len(train_loader.dataset) // + tb_size) // ab_size + + def configure_optimizers(self): + # if self.args.use_default_configure: + from fengshen.models.model_utils import configure_optimizers + return configure_optimizers(self) + + def training_step(self, batch, batch_idx): + if self.label_smoothing == 0: + output = self.model(input_ids=batch['input_ids'], + attention_mask=batch['attention_mask'], + labels=batch['labels']) + + self.log('train_loss', output.loss, sync_dist=True) + return output.loss + + # TODO label_smoothing should be implemented at here + else: + labels = batch["labels"] + output = self.model(input_ids=batch['input_ids'], + attention_mask=batch['attention_mask'], + decoder_input_ids=batch['decoder_input_ids']) + + logits = output["logits"] + m = torch.nn.LogSoftmax(dim=-1) + lprobs = m(logits.float()) + loss, _ = self.loss_fn(lprobs.view(-1, lprobs.size(-1)), labels.view(-1), + self.label_smoothing, self.tokenizer.pad_token_id) + self.log('train_loss', loss, sync_dist=True) + return loss + + def comput_metrix(self, logits, labels): + y_pred = torch.argmax(logits, dim=-1) + y_pred = y_pred.view(size=(-1, )) + + y_true = labels.view(size=(-1, )) + pad_mask = y_true.eq(1) + valid_length = y_true.ne(1).sum() + + corr = torch.eq(y_pred, y_true.float()) + corr.masked_fill_(pad_mask, 0.0) + acc = torch.sum(corr.float()) / valid_length + return acc + + def get_sufficient_stats(self, translations: List[str], references: List[str]) -> pd.DataFrame: + assert len(translations) == len(references), ( + f"There are {len(translations)} translated sentences " + f"but {len(references)} reference sentences" + ) + + # for sentence, ref in zip(translations, references): + + sentence_bleu = self.blue_metric.corpus_score(translations, [references]) + self.sufficient_stats.append( + [ + # Number of correct 1-grams, .., 4-grams + sentence_bleu.counts[0], + sentence_bleu.counts[1], + sentence_bleu.counts[2], + sentence_bleu.counts[3], + # Total number of 1-grams, .., 4-grams + sentence_bleu.totals[0], + sentence_bleu.totals[1], + sentence_bleu.totals[2], + sentence_bleu.totals[3], + # Length of translated sentence. + sentence_bleu.sys_len, + # Length of reference sentence. + sentence_bleu.ref_len, + ] + ) + + def on_validation_start(self) -> None: + # rm file at validation start + prefix, ext = os.path.splitext(self.hparams.output_save_path) + file_path_rank = '{}_{}{}'.format( + prefix, + self.trainer._accelerator_connector.cluster_environment. 
+ global_rank(), ext) + if os.path.exists(file_path_rank): + # logger.debug('rm {}'.format(file_path_rank)) + os.remove(file_path_rank) + + def validation_step(self, batch, batch_idx): + + def postprocess_text(preds, labels, tgt_zh): + if tgt_zh: + preds = [pred.strip() for pred in preds] + labels = [label.strip() for label in labels] + else: + preds = list(map(lambda x: mose_decode(x.strip().split()), preds)) + labels = list(map(lambda x: mose_decode(x.strip().split()), labels)) + return preds, labels + + tmp_label = batch['labels'] + end_token_index = torch.where(tmp_label == self.tokenizer.eos_token_id)[1] + for idx, end_idx in enumerate(end_token_index): + tmp_label[idx][end_idx+1:] = -100 + output = self.model(input_ids=batch['input_ids'], + attention_mask=batch['attention_mask'], + labels=tmp_label) + generated_ids = self.model.generate( + input_ids=batch['input_ids'], + attention_mask=batch['attention_mask'], + max_length=self.hparams.max_dec_length) + + preds = self.tokenizer.batch_decode(generated_ids, + skip_special_tokens=True) + labels = torch.where(batch['labels'] != -100, batch['labels'], + self.tokenizer.pad_token_id) + + labels = self.tokenizer.batch_decode(labels, + skip_special_tokens=True) + + decoded_preds, decoded_labels = postprocess_text(preds, labels, self.args.tgt_zh) + # save preds for every rank + prefix, ext = os.path.splitext(self.hparams.output_save_path) + file_path_rank = '{}_{}{}'.format( + prefix, + self.trainer._accelerator_connector.cluster_environment. + global_rank(), ext) + self.save_prediction_to_file(preds=decoded_preds, + sources=batch['src'], + targets=decoded_labels, + ori_target=batch['tgt'], + file_path=file_path_rank) + + if self.args.tgt_zh: + new_preds = [chinese_char_tokenize(p) for p in decoded_preds] + new_labels = [chinese_char_tokenize(label) for label in decoded_labels] + self.get_sufficient_stats(new_preds, new_labels) + else: + self.get_sufficient_stats(decoded_preds, decoded_labels) + # batch_bleu = self.blue_metric.corpus_score(decoded_preds, [decoded_labels]).score + acc = self.comput_metrix(output.logits, batch['labels']) + self.log('val_loss', output.loss, sync_dist=True) + self.log('val_acc', acc, sync_dist=True) + + def validation_epoch_end(self, outputs): + rank_zero_info("***** Validation results *****") + sentence_states = pd.DataFrame( + self.sufficient_stats, + columns=[ + "correct_1_grams", + "correct_2_grams", + "correct_3_grams", + "correct_4_grams", + "total_1_grams", + "total_2_grams", + "total_3_grams", + "total_4_grams", + "translation_length", + "reference_length", + ] + ) + + computed_bleu = calc_bleu_from_stats(sentence_states) + rank_zero_info("valid_sacrebleu= {}\n".format(computed_bleu.score)) + self.log('valid_sacrebleu', computed_bleu.score, sync_dist=True) + self.sufficient_stats = [] + + def on_save_checkpoint(self, checkpoint) -> None: + if self.trainer._accelerator_connector.cluster_environment.global_rank( + ) == 0: + self.model.save_pretrained( + os.path.join( + self.trainer.checkpoint_callback.dirpath, + 'finetuned_epoch{}_step{}'.format( + checkpoint['epoch'], checkpoint['global_step']))) + + def save_prediction_to_file(self, preds, sources, targets, ori_target, file_path): + with open(file_path, 'a', encoding='utf-8') as f: + for idx, pred in enumerate(preds): + source = sources[idx] + target = targets[idx] + tmp_result = dict() + tmp_result['pred'] = pred + tmp_result['source'] = source + tmp_result['label'] = target + tmp_result['ori_label'] = ori_target[idx] + json_data = json.dumps(tmp_result, 
ensure_ascii=False) + f.write(json_data + '\n') + + def test_step(self, batch, batch_idx): + # print(batch) + texts = batch['src'] + # output summary and metrics + self.model.eval() + generated_ids = self.model.generate( + input_ids=batch['input_ids'], + attention_mask=batch['attention_mask'], + max_length=self.hparams.max_dec_length + ) + preds = self.tokenizer.batch_decode(generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True) + + labels = torch.where(batch['labels'] != -100, batch['labels'], + self.tokenizer.pad_token_id) + labels = self.tokenizer.batch_decode( + labels, skip_special_tokens=True, clean_up_tokenization_spaces=True) + + self.save_prediction_to_file(preds, texts, labels, self.hparams.output_save_path) + + +def configure_logger(logging_lever=logging.INFO): + logging.basicConfig( + format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", + datefmt="%m/%d/%Y %H:%M:%S", + handlers=[logging.StreamHandler(sys.stdout)], + ) + logger.setLevel(logging_lever) + + +def main(): + args_parser = argparse.ArgumentParser("Pegasus Task") + args_parser.add_argument('--do_eval_only', + action='store_true', + default=False) + args_parser.add_argument('--other_model', + action='store_true', + default=False) + args_parser.add_argument('--reverse_src_tgt', + action='store_true', + default=False) + args_parser.add_argument('--tgt_zh', + action='store_true', + default=False) + args_parser.add_argument('--early_stopping_callback', + action='store_true', + default=False) + args_parser.add_argument('--pretrained_model_path', + default='facebook/mbart', + type=str) + args_parser.add_argument('--output_save_path', + default='predict.json', + type=str) + args_parser.add_argument('--max_enc_length', default=512, type=int) + args_parser.add_argument('--max_dec_length', default=512, type=int) + + # * Args for data preprocessing + args_parser = UniversalDataModule.add_data_specific_args(args_parser) + + # * Args for training + args_parser = Trainer.add_argparse_args(args_parser) + args_parser = UniversalCheckpoint.add_argparse_args(args_parser) + args_parser = FinetuneTranslation.add_model_specific_args(args_parser) + args_parser = add_module_args(args_parser) + args_parser = add_inverse_square_args(args_parser) + + args = args_parser.parse_args() + + if args.other_model: + tokenizer = AutoTokenizer.from_pretrained(args.model_path) + else: + tokenizer = DeltalmTokenizer.from_pretrained(args.model_path) + # tokenizer = AutoTokenizer.from_pretrained(args.model_path) + print("tokenizer vocab size: ", tokenizer.vocab_size) + model = FinetuneTranslation(args, tokenizer) + collator = DataCollator(model.model, tokenizer, args.max_enc_length, args.max_dec_length, args.reverse_src_tgt) + data_model = UniversalDataModule(tokenizer=tokenizer, + args=args, + # datasets=dataset, + collate_fn=collator) + + lr_monitor = LearningRateMonitor(logging_interval='step') + + configure_logger(logging_lever=logging.INFO) + + if not args.do_eval_only: + + lr_monitor = LearningRateMonitor(logging_interval='step') + tensorboard_logger = loggers.TensorBoardLogger( + save_dir=os.path.join(args.default_root_dir, 'logs/'), + name=os.path.basename(os.path.dirname(args.model_path))) + checkpoint_callback = UniversalCheckpoint(args) + # early_stop = EarlyStopping(monitor=args.monitor, mode=args.mode) + trainer = Trainer.from_argparse_args( + args, logger=tensorboard_logger, callbacks=[lr_monitor, checkpoint_callback]) + trainer.fit(model, data_model) + + else: + trainer = Trainer.from_argparse_args(args) + 
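+        # evaluation-only mode: run validation to report val_loss/val_acc and valid_sacrebleu;
+        # decoded predictions are appended to --output_save_path (one file per rank)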
trainer.validate(model, data_model) + # trainer.test(model, data_model) + + +if __name__ == '__main__': + main() diff --git a/fengshen/examples/translate/finetune_deltalm.sh b/fengshen/examples/translate/finetune_deltalm.sh new file mode 100644 index 0000000000000000000000000000000000000000..6d6bd9ef5fde6c9afd2957b79118e13b4e94d8da --- /dev/null +++ b/fengshen/examples/translate/finetune_deltalm.sh @@ -0,0 +1,115 @@ +#!/bin/bash + +#SBATCH --job-name=mbart_en_zh +#SBATCH --nodes=1 +#SBATCH --ntasks-per-node=8 +#SBATCH --gres=gpu:8 # number of gpus +#SBATCH --cpus-per-task=32 +#SBATCH -o %x-%j.log + +set -x -e + +echo "START TIME: $(date)" + +MODEL_NAME=deltalm_en_zh +MICRO_BATCH_SIZE=16 +ROOT_DIR=../../workspace +MODEL_ROOT_DIR=$ROOT_DIR/${MODEL_NAME} + + +if [ ! -d ${MODEL_ROOT_DIR} ];then + mkdir ${MODEL_ROOT_DIR} + echo ${MODEL_ROOT_DIR} created!!!!!!!!!!!!!! +else + echo ${MODEL_ROOT_DIR} exist!!!!!!!!!!!!!!! +fi + +output_save_path=${MODEL_ROOT_DIR}.json +if [ -f ${output_save_path} ];then + echo ${output_save_path} exist, rm it!!!!!!!!!!!!!!!!! + rm ${output_save_path} +fi + +ZERO_STAGE=1 + +config_json="${MODEL_ROOT_DIR}/ds_config.${MODEL_NAME}.json" + +# Deepspeed figures out GAS dynamically from dynamic GBS via set_train_batch_size() +cat < $config_json +{ + "train_micro_batch_size_per_gpu": ${MICRO_BATCH_SIZE}, + "steps_per_print": 1000, + "gradient_clipping": 1.0, + "zero_optimization": { + "stage": $ZERO_STAGE, + "contiguous_gradients": false + }, + "zero_allow_untested_optimizer": false, + "fp16": { + "enabled": true + }, + "wall_clock_breakdown": false +} +EOT + +export PL_DEEPSPEED_CONFIG_PATH=$config_json + + +TRAINER_ARGS=" + --max_epochs 20 \ + --gpus 1 \ + --num_nodes 1 \ + --strategy deepspeed_stage_${ZERO_STAGE} \ + --default_root_dir ${MODEL_ROOT_DIR} \ + --save_ckpt_path ${MODEL_ROOT_DIR}/ckpt \ + --save_top_k 3 \ + --monitor valid_sacrebleu \ + --mode max \ + --save_last \ + --every_n_train_steps 0 \ + --val_check_interval 0.2 \ + --label_smoothing 0.1 \ + --warmup_steps 4000 \ + --learning_rate 1e-7 \ + --adam_beta2 0.98 \ + --scheduler_type inverse_sqrt \ + --reverse_src_tgt \ + --tgt_zh \ +" + +DATA_ARGS=" + --datasets_name case_test \ + --num_workers 8 \ + --train_batchsize $MICRO_BATCH_SIZE \ + --val_batchsize $MICRO_BATCH_SIZE \ + --test_batchsize $MICRO_BATCH_SIZE \ + --val_datasets_field val \ + --max_enc_length 256 \ + --max_dec_length 256 \ +" + +mode_path="IDEA-CCNL/Randeng-Deltalm-362M-En-Zn" + + +MODEL_ARGS=" + --model_path $mode_path \ + --output_save_path $output_save_path \ +" + +SCRIPTS_PATH=finetune_deltalm.py + +cat $SCRIPTS_PATH + +export CMD=" \ + $SCRIPTS_PATH \ + $TRAINER_ARGS \ + $MODEL_ARGS \ + $DATA_ARGS \ + " + +echo $CMD + +source activate +conda activate fengshen +# srun python3 $CMD +python3 $CMD diff --git a/fengshen/examples/translate/prepare_dataset.py b/fengshen/examples/translate/prepare_dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..5ce8cc74e05ab477a5863b99470c30c4073876c8 --- /dev/null +++ b/fengshen/examples/translate/prepare_dataset.py @@ -0,0 +1,37 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +import sys +import json +import os + + +def main(file_path, src_lang, tgt_lang): + + file_list = ["train", "valid", "test"] + for filename in file_list: + sys.stderr.write("**** Start processing {} ... 
****\n".format(filename)) + src_full_path = os.path.join(file_path, ".".join((filename, src_lang))) + tgt_full_path = os.path.join(file_path, ".".join((filename, tgt_lang))) + src_reader = open(src_full_path, 'r') + tgt_reader = open(tgt_full_path, "r") + + writer_full_path = os.path.join(file_path, ".".join((filename, src_lang + "_" + tgt_lang))) + writer = open(writer_full_path, "w") + # combine_dict = OrderedDict() + for row_src, row_tgt in zip(src_reader, tgt_reader): + combine_line = {} + combine_line["src"] = row_src.strip() + combine_line["tgt"] = row_tgt.strip() + json.dump(combine_line, writer, ensure_ascii=False) + writer.write('\n') + # print(row_src) + # print(row_tgt) + sys.stderr.write(f"**** Done change {filename} format **** \n") + + +if __name__ == "__main__": + file_path = sys.argv[1] + src_lang, tgt_lang = sys.argv[2].split("-") + + main(file_path, src_lang, tgt_lang) diff --git a/fengshen/examples/ubert/README.md b/fengshen/examples/ubert/README.md new file mode 100644 index 0000000000000000000000000000000000000000..fdad2ca0d948830c51bf141dceb907c4531a4690 --- /dev/null +++ b/fengshen/examples/ubert/README.md @@ -0,0 +1,280 @@ +# Ubert: 统一 NLU 任务新范式 +- 论文:[https://arxiv.org/pdf/2206.12094.pdf](https://arxiv.org/pdf/2206.12094.pdf) +- 知乎:[https://zhuanlan.zhihu.com/p/539958182?](https://zhuanlan.zhihu.com/p/539958182?) + +### 简介 +Ubert 是我们在做 [2022AIWIN 世界人工智能创新大赛:中文保险小样本多任务](http://ailab.aiwin.org.cn/competitions/68#results) 时提出的一种解决方案。并取得A/B榜榜首的成绩,且B榜综合成绩领先第二名超过 1 个百分点,领先第三名接近 5 个百分点。相比于官方提供的 baseline,提高 20 个百分点。Ubert 不仅可以完成 实体识别、事件抽取等常见抽取任务,还可以完成新闻分类、自然语言推理等分类任务,且所有任务是共享一个统一框架、统一任务、统一训练目标的模型。解题思路和方案可以参考我们的答辩PPT,或者参考我们的[知乎文章](https://zhuanlan.zhihu.com/p/539958182?) + +## 开源模型列表 + 开源的模型是我们在比赛模型的基础上重新整理 70+ 份数据,共 100万+条样本,进行预训练而得到的,可直接开箱即用。开源模型地址如下: +| 模型 | 地址 | +|:---------:|:--------------:| +| Erlangshen-Ubert-110M-Chinese | [https://huggingface.co./IDEA-CCNL/Erlangshen-Ubert-110M-Chinese](https://huggingface.co./IDEA-CCNL/Erlangshen-Ubert-110M-Chinese) | +| Erlangshen-Ubert-330M-Chinese | [https://huggingface.co./IDEA-CCNL/Erlangshen-Ubert-330M-Chinese](https://huggingface.co./IDEA-CCNL/Erlangshen-Ubert-330M-Chinese) | + + +## 快速开箱使用 +安装我们的 fengshen 框架,我们暂且提供如下方式安装 +```python +git clone https://github.com/IDEA-CCNL/Fengshenbang-LM.git +cd Fengshenbang-LM +pip install --editable ./ +``` + +一键运行下面代码得到预测结果, 你可以任意修改示例 text 和要抽取的 entity_type,体验一下 Zero-Shot 性能 +```python +import argparse +from fengshen import UbertPiplines + +total_parser = argparse.ArgumentParser("TASK NAME") +total_parser = UbertPiplines.piplines_args(total_parser) +args = total_parser.parse_args() + +test_data=[ + { + "task_type": "抽取任务", + "subtask_type": "实体识别", + "text": "这也让很多业主据此认为,雅清苑是政府公务员挤对了国家的经适房政策。", + "choices": [ + {"entity_type": "小区名字"}, + {"entity_type": "岗位职责"} + ], + "id": 0} +] + +model = UbertPiplines(args) +result = model.predict(test_data) +for line in result: + print(line) +``` + +## 继续 finetune 使用 + +开源的模型我们已经经过大量的数据进行预训练而得到,可以直接进行 Zero-Shot,如果你还想继续finetune,可以参考我们的 [example.py](https://github.com/IDEA-CCNL/Fengshenbang-LM/blob/main/fengshen/examples/ubert/example.py)。你只需要将我们数据预处理成为我们定义的格式,即可使用简单的几行代码完成模型的训练和推理。我们是复用 pytorch-lightning 的 trainer 。在训练时,可以直接传入 trainer 的参数,此外我们还定义了一些其他参数。常用的参数如下: + + +```sh +--pretrained_model_path #预训练模型的路径,默认 +--load_checkpoints_path #加载模型的路径,如果你finetune完,想加载模型进行预测可以传入这个参数 +--batchsize #批次大小, 默认 8 +--monitor #保存模型需要监控的变量,例如我们可监控 val_span_acc +--checkpoint_path #模型保存的路径, 默认 ./checkpoint +--save_top_k #最多保存几个模型, 默认 3 +--every_n_train_steps #多少步保存一次模型, 
默认 100 +--learning_rate #学习率, 默认 2e-5 +--warmup #预热的概率, 默认 0.01 +--default_root_dir #模型日子默认输出路径 +--gradient_clip_val #梯度截断, 默认 0.25 +--gpus #gpu 的数量 +--check_val_every_n_epoch #多少次验证一次, 默认 100 +--max_epochs #多少个 epochs, 默认 5 +--max_length #句子最大长度, 默认 512 +--num_labels #训练每条样本最多取多少个label,超过则进行随机采样负样本, 默认 10 +``` + +## 数据预处理示例 + +整个模型的 Piplines 我们已经写好,所以为了方便,我们定义了数据格式。目前我们在预训练中主要含有一下几种任务类型 + +| task_type | subtask_type | +|:---------:|:--------------:| +| 分类任务 | 文本分类 | +| | 自然语言推理 | +| | 情感分析 | +| | 多项式阅读理解 | +| 抽取任务 | 实体识别 | +| | 事件抽取 | +| | 抽取式阅读理解 | +| | 关系抽取 | + +### 分类任务 + +#### 普通分类任务 +对于分类任务,我们把类别描述当作是 entity_type,我们主要关注 label 字段,label为 1 表示该该标签是正确的标签。如下面示例所示 +```json +{ + "task_type": "分类任务", + "subtask_type": "文本分类", + "text": "7000亿美元救市方案将成期市毒药", + "choices": [{ + "entity_type": "一则股票新闻", + "label": 1, + "entity_list": [] + }, { + "entity_type": "一则教育新闻", + "label": 0, + "entity_list": [] + }, { + "entity_type": "一则科学新闻", + "label": 0, + "entity_list": [] + }], + "id": 0 +} + +``` + +#### 自然语言推理 +```json +{ + "task_type": "分类任务", + "subtask_type": "自然语言推理", + "text": "在白云的蓝天下,一个孩子伸手摸着停在草地上的一架飞机的螺旋桨。", + "choices": [{ + "entity_type": "可以推断出:一个孩子正伸手摸飞机的螺旋桨。", + "label": 1, + "entity_list": [] + }, { + "entity_type": "不能推断出:一个孩子正伸手摸飞机的螺旋桨。", + "label": 0, + "entity_list": [] + }, { + "entity_type": "很难推断出:一个孩子正伸手摸飞机的螺旋桨。", + "label": 0, + "entity_list": [] + }], + "id": 0 +} +``` + + +#### 语义匹配 + +```json +{ + "task_type": "分类任务", + "subtask_type": "语义匹配", + "text": "不要借了我是试试看能否操作的", + "choices": [{ + "entity_type": "不能理解为:借款审核期间能否取消借款", + "label": 1, + "entity_list": [] + }, { + "entity_type": "可以理解为:借款审核期间能否取消借款", + "label": 0, + "entity_list": [] + }], + "id": 0 +} + +``` + +### 抽取任务 +对于抽取任务,label 字段是无效的 +#### 实体识别 +```json +{ + "task_type": "抽取任务", + "subtask_type": "实体识别", + "text": "彭小军认为,国内银行现在走的是台湾的发卡模式,先通过跑马圈地再在圈的地里面选择客户,", + "choices": [{ + "entity_type": "地址", + "label": 0, + "entity_list": [{ + "entity_name": "台湾", + "entity_type": "地址", + "entity_idx": [ + [15, 16] + ] + }] + }{ + "entity_type": "政府机构", + "label": 0, + "entity_list": [] + }, { + "entity_type": "电影名称", + "label": 0, + "entity_list": [] + }, { + "entity_type": "人物姓名", + "label": 0, + "entity_list": [{ + "entity_name": "彭小军", + "entity_type": "人物姓名", + "entity_idx": [ + [0, 2] + ] + }] + }, + "id": 0 +} + +``` +#### 事件抽取 +```json + +{ + "task_type": "抽取任务", + "subtask_type": "事件抽取", + "text": "小米9价格首降,6GB+128GB跌了200,却不如红米新机值得买", + "choices": [{ + "entity_type": "降价的时间", + "label": 0, + "entity_list": [] + }, { + "entity_type": "降价的降价方", + "label": 0, + "entity_list": [] + }, { + "entity_type": "降价的降价物", + "label": 0, + "entity_list": [{ + "entity_name": "小米9", + "entity_type": "降价的降价物", + "entity_idx": [ + [0, 2] + ] + }, { + "entity_name": "小米9", + "entity_type": "降价的降价物", + "entity_idx": [ + [0, 2] + ] + }] + }, { + "entity_type": "降价的降价幅度", + "label": 0, + "entity_list": [] + }], + "id": 0 +} +``` +#### 抽取式阅读理解 + +```json +{ + "task_type": "抽取任务", + "subtask_type": "抽取式阅读理解", + "text": "截至2014年7月1日,圣地亚哥人口估计为1381069人,是美国第八大城市,加利福尼亚州第二大城市。它是圣迭戈-蒂华纳城市群的一部分,是美国与底特律-温莎之后的第二大跨境城市群,人口4922723。圣地亚哥是加州的出生地,以全年温和的气候、天然的深水港、广阔的海滩、与美国海军的长期联系以及最近作为医疗和生物技术发展中心而闻名。", + "choices": [{ + "entity_type": "除了医疗保健,圣迭戈哪个就业部门已经强势崛起?", + "label": 0, + "entity_list": [{ + "entity_name": "生物技术发展", + "entity_idx": [ + [153, 158] + ] + }] + }, { + "entity_type": "在所有的军事部门中,哪一个在圣地亚哥的存在最为强大?", + "label": 0, + "entity_list": [{ + "entity_name": "美国海军", + "entity_idx": [ + [135, 138] + ] + }] + }, { + "entity_type": "在美国十大城市中,圣迭戈排名哪一位?", 
+ "label": 0, + "entity_list": [{ + "entity_name": "第八", + "entity_idx": [ + [33, 34] + ] + }] + }], + "id": 0 +} +``` + diff --git a/fengshen/examples/ubert/example.py b/fengshen/examples/ubert/example.py new file mode 100644 index 0000000000000000000000000000000000000000..bedd365ff67ff5d9b1f8f22777dab9b5a8b02394 --- /dev/null +++ b/fengshen/examples/ubert/example.py @@ -0,0 +1,95 @@ +import argparse +from fengshen import UbertPipelines +import os +os.environ["CUDA_VISIBLE_DEVICES"] = '6' + + +def main(): + total_parser = argparse.ArgumentParser("TASK NAME") + total_parser = UbertPipelines.pipelines_args(total_parser) + args = total_parser.parse_args() + + # 设置一些训练要使用到的参数 + args.pretrained_model_path = 'IDEA-CCNL/Erlangshen-Ubert-110M-Chinese' #预训练模型的路径,我们提供的预训练模型存放在HuggingFace上 + args.default_root_dir = './' #默认主路径,用来放日志、tensorboard等 + args.max_epochs = 5 + args.gpus = 1 + args.batch_size = 1 + + # 只需要将数据处理成为下面数据的 json 样式就可以一键训练和预测,下面只是提供了一条示例样本 + train_data = [ + { + "task_type": "抽取任务", + "subtask_type": "实体识别", + "text": "彭小军认为,国内银行现在走的是台湾的发卡模式,先通过跑马圈地再在圈的地里面选择客户,", + "choices": [ + {"entity_type": "地址", "label": 0, "entity_list": [ + {"entity_name": "台湾", "entity_type": "地址", "entity_idx": [[15, 16]]}]}, + {"entity_type": "书名", "label": 0, "entity_list": []}, + {"entity_type": "公司", "label": 0, "entity_list": []}, + {"entity_type": "游戏", "label": 0, "entity_list": []}, + {"entity_type": "政府机构", "label": 0, "entity_list": []}, + {"entity_type": "电影名称", "label": 0, "entity_list": []}, + {"entity_type": "人物姓名", "label": 0, "entity_list": [ + {"entity_name": "彭小军", "entity_type": "人物姓名", "entity_idx": [[0, 2]]}]}, + {"entity_type": "组织机构", "label": 0, "entity_list": []}, + {"entity_type": "岗位职位", "label": 0, "entity_list": []}, + {"entity_type": "旅游景点", "label": 0, "entity_list": []} + ], + "id": 0} + ] + dev_data = [ + { + "task_type": "抽取任务", + "subtask_type": "实体识别", + "text": "就天涯网推出彩票服务频道是否是业内人士所谓的打政策“擦边球”,记者近日对此事求证彩票监管部门。", + "choices": [ + {"entity_type": "地址", "label": 0, "entity_list": []}, + {"entity_type": "书名", "label": 0, "entity_list": []}, + {"entity_type": "公司", "label": 0, "entity_list": [ + {"entity_name": "天涯网", "entity_type": "公司", "entity_idx": [[1, 3]]}]}, + {"entity_type": "游戏", "label": 0, "entity_list": []}, + {"entity_type": "政府机构", "label": 0, "entity_list": []}, + {"entity_type": "电影名称", "label": 0, "entity_list": []}, + {"entity_type": "人物姓名", "label": 0, "entity_list": []}, + {"entity_type": "组织机构", "label": 0, "entity_list": [ + {"entity_name": "彩票监管部门", "entity_type": "组织机构", "entity_idx": [[40, 45]]}]}, + {"entity_type": "岗位职位", "label": 0, "entity_list": [ + {"entity_name": "记者", "entity_type": "岗位职位", "entity_idx": [[31, 32]]}]}, + {"entity_type": "旅游景点", "label": 0, "entity_list": []} + ], + + "id": 0} + + ] + test_data = [ + { + "task_type": "抽取任务", + "subtask_type": "实体识别", + "text": "这也让很多业主据此认为,雅清苑是政府公务员挤对了国家的经适房政策。", + "choices": [ + {"entity_type": "地址", "label": 0, "entity_list": [ + {"entity_name": "雅清苑", "entity_type": "地址", "entity_idx": [[12, 14]]}]}, + {"entity_type": "书名", "label": 0, "entity_list": []}, + {"entity_type": "公司", "label": 0, "entity_list": []}, + {"entity_type": "游戏", "label": 0, "entity_list": []}, + {"entity_type": "政府机构", "label": 0, "entity_list": []}, + {"entity_type": "电影名称", "label": 0, "entity_list": []}, + {"entity_type": "人物姓名", "label": 0, "entity_list": []}, + {"entity_type": "组织机构", "label": 0, "entity_list": []}, + {"entity_type": "岗位职位", "label": 0, "entity_list": [ + {"entity_name": "公务员", "entity_type": 
"岗位职位", "entity_idx": [[18, 20]]}]}, + {"entity_type": "旅游景点", "label": 0, "entity_list": []} + ], + "id": 0}, + ] + + model = UbertPipelines(args) + model.fit(train_data, dev_data) + result = model.predict(test_data) + for line in result: + print(line) + + +if __name__ == "__main__": + main() diff --git a/fengshen/examples/unimc/README.md b/fengshen/examples/unimc/README.md new file mode 100644 index 0000000000000000000000000000000000000000..16abf3ff69c5ab7b8b8ca1f7c7ec191cbdf64ec0 --- /dev/null +++ b/fengshen/examples/unimc/README.md @@ -0,0 +1,221 @@ +[**中文**](./README.md) | [**English**](./README_en.md) +# UniMC + +EMNLP 2022 论文 《[Zero-Shot Learners for Natural Language Understanding via a Unified Multiple Choice Perspective](https://arxiv.org/abs/2210.08590)》源码 + +![](./unimc.jpg) + +## Update +- [2022-10-18] Release preprint in arXiv. +- [2022-10-14] Release code in GitHub. + +## Requirements + +安装 fengshen 框架 + +```shell +git clone https://github.com/IDEA-CCNL/Fengshenbang-LM.git +cd Fengshenbang-LM +pip install --editable . +``` + +## Quick Start + +你可以参考我们的 [example.py](./example.py) 脚本,只需要将处理好的 train、dev、test 即输入模型即可。 +```python +import argparse +from fengshen.pipelines.multiplechoice import UniMCPipelines + +total_parser = argparse.ArgumentParser("TASK NAME") +total_parser = UniMCPipelines.piplines_args(total_parser) +args = total_parser.parse_args() + +pretrained_model_path = 'IDEA-CCNL/Erlangshen-UniMC-RoBERTa-110M-Chinese' +args.learning_rate=2e-5 +args.max_length=512 +args.max_epochs=3 +args.batchsize=8 +args.default_root_dir='./' +model = UniMCPipelines(args,model_path=pretrained_model_path) + +train_data = [] +dev_data = [] +test_data = [{ + "texta": "就是废物,充电不进害得老子把主板烧了,客服不耐烦", + "textb": "", + "question": "", + "choice": ["这是一条差评", "这是一条好评"], + "answer": "这是一条差评", + "label": 0, + "id": 31 +}] + +if args.train: + model.train(train_data, dev_data) +result = model.predict(test_data) +``` +## Pretrained Model +对于英文模型,我们使用14份 multiplechoice 数据集进行了预训练。在中文模型中,我们已经收集了48份数据集对模型进行预训练,我们已经将预训练模型开源到 HuggingFace 社区当中。 + +| 模型 | 地址 | +|:---------:|:--------------:| +| Erlangshen-UniMC-Albert-235M-English | [https://huggingface.co./IDEA-CCNL/Erlangshen-UniMC-Albert-235M-English](https://huggingface.co./IDEA-CCNL/Erlangshen-UniMC-Albert-235M-English) | +| Erlangshen-UniMC-RoBERTa-110M-Chinese | [https://huggingface.co./IDEA-CCNL/Erlangshen-UniMC-RoBERTa-110M-Chinese](https://huggingface.co./IDEA-CCNL/Erlangshen-UniMC-RoBERTa-110M-Chinese) | +| Erlangshen-UniMC-RoBERTa-330M-Chinese | [https://huggingface.co./IDEA-CCNL/Erlangshen-UniMC-RoBERTa-330M-Chinese](https://huggingface.co./IDEA-CCNL/Erlangshen-UniMC-RoBERTa-330M-Chinese) | +| Erlangshen-UniMC-MegatronBERT-1.3B-Chinese | [https://huggingface.co./IDEA-CCNL/Erlangshen-UniMC-MegatronBERT-1.3B-Chinese](https://huggingface.co./IDEA-CCNL/Erlangshen-UniMC-MegatronBERT-1.3B-Chinese) | + +## Experiments + + +### English + +为了测评 UniMC 的性能,在英文中,我们使用 14份 multiple-choice 数据集(具体数据参考原论文)来对模型进行预训练,使其具备做选择题的能力, + +**Zero-shot** +| Model | T0 11B | GLaM 60B | FLAN 137B | PaLM 540B | UniMC 235M | +|---------|--------|----------|-----------|-----------|------------| +| ANLI R1 | 43.6 | 40.9 | 47.7 | 48.4 | **52.0** | +| ANLI R2 | 38.7 | 38.2 | 43.9 | 44.2 | **44.4** | +| ANLI R3 | 41.3 | 40.9 | 47.0 | 45.7 | **47.8** | +| CB | 70.1 | 33.9 | 64.1 | 51.8 | **75.7** | +### Chinese + +为了测评 UniMC 在中文场景下的性能我们使用 13份 有监督数据集来对模型进行预训练,预训练数据如下: +| Task type | Task | # of option | Data size | +|---------|--------|----------|-----------| +| Multiple-choice | c3 | 4 
| 11.8k | +| Multiple-choice | ClozeT | 2 | 0.7k | +| Multiple-choice | CMRC2019 | n | 11.4k | +| Multiple-choice | GCRC | 4 | 7.8k | +| Classification | DuEE-Fin | 12 | 4.3k | +| Classification | DuEE1.0 | 65 | 10.3k | +| Classification | Fudan | 20 | 19.6k | +| Classification | THUNEWS | 10 | 180k | +| NLI | CMNLI | 3 | 39k | +| NLI | SNLI | 3 | 545.8k | +| Paraphrace | AFQMC | 2 | 34.3k | +| Paraphrace | PAWS-X | 2 | 49k | +| Paraphrace | STS-B | 2 | 80k | + +我们使用中文领域常用的benchmark来测试UniMC的性能,具体是FewCLUE的9个任务,我们在 test_public 上测评模型的性能。 + + +**Few-shot** +| Model | eprstmt | csldcp | tnews | iflytek | ocnli | bustm | chid | csl | wsc | Avg | +|------------|------------|----------|-----------|----------|-----------|-----------|-----------|----------|-----------|-----------| +| Finetuning | 65.4 | 35.5 | 49 | 32.8 | 33 | 60.7 | 14.9 | 50 | 55.6 | 44.1 | +| PET | 86.7 | 51.7 | 54.5 | 46 | 44 | 56 | 61.2 | 59.4 | 57.5 | 57.44 | +| LM-BFF | 85.6 | 54.4 | 53 | 47.1 | 41.6 | 57.6 | 61.2 | 51.7 | 54.7 | 56.32 | +| P-tuning | 88.3 | 56 | 54.2 | **57.6** | 41.9 | 60.9 | 59.3 | **62.9** | 58.1 | 59.91 | +| EFL | 84.9 | 45 | 52.1 | 42.7 | 66.2 | 71.8 | 30.9 | 56.6 | 53 | 55.91 | +| [UniMC-RoBERTa-110M](https://huggingface.co./IDEA-CCNL/Erlangshen-UniMC-RoBERTa-110M-Chinese) | 88.64 | 54.08 | 54.32 | 48.6 | 66.55 | 73.76 | 67.71 | 52.54 | 59.92 | 62.86 | +| [UniMC-RoBERTa-330M](https://huggingface.co./IDEA-CCNL/Erlangshen-UniMC-RoBERTa-330M-Chinese) | 89.53 | 57.3 | 54.25 | 50 | 70.59 | 77.49 | 78.09 | 55.73 | 65.16 | 66.46 | +| [UniMC-MegatronBERT-1.3B](https://huggingface.co./IDEA-CCNL/Erlangshen-UniMC-MegatronBERT-1.3B-Chinese) | **89.278** | **60.9** | **57.46** | 52.89 | **76.33** | **80.37** | **90.33** | 61.73 | **79.15** | **72.05** | + +**Zero-shot** + +| Model | eprstmt | csldcp | tnews | iflytek | ocnli | bustm | chid | csl | wsc | Avg | +|---------------|-----------|-----------|-----------|-----------|-----------|----------|----------|----------|-----------|-----------| +| GPT-zero | 57.5 | 26.2 | 37 | 19 | 34.4 | 50 | 65.6 | 50.1 | 50.3 | 43.4 | +| PET-zero | 85.2 | 12.6 | 26.1 | 26.6 | 40.3 | 50.6 | 57.6 | 52.2 | 54.7 | 45.1 | +| NSP-BERT | 86.9 | 47.6 | 51 | 41.6 | 37.4 | 63.4 | 52 | **64.4** | 59.4 | 55.96 | +| ZeroPrompt | - | - | - | 16.14 | 46.16 | - | - | - | 47.98 | - | +| Yuan1.0-13B | 88.13 | 38.99 | 57.47 | 38.82 | 48.13 | 59.38 | 86.14 | 50 | 38.99 | 56.22 | +| ERNIE3.0-240B | 88.75 | **50.97** | **57.83** | **40.42** | 53.57 | 64.38 | 87.13 | 56.25 | 53.46 | 61.41 | +| [UniMC-RoBERTa-110M](https://huggingface.co./IDEA-CCNL/Erlangshen-UniMC-RoBERTa-110M-Chinese) | 86.16 | 31.26 | 46.61 | 26.54 | 66.91 | 73.34 | 66.68 | 50.09 | 53.66 | 55.7 | +| [UniMC-RoBERTa-330M](https://huggingface.co./IDEA-CCNL/Erlangshen-UniMC-RoBERTa-330M-Chinese) | 87.5 | 30.4 | 47.6 | 31.5 | 69.9 | 75.9 | 78.17 | 49.5 | 60.55 | 59.01 | +| [UniMC-MegatronBERT-1.3B](https://huggingface.co./IDEA-CCNL/Erlangshen-UniMC-MegatronBERT-1.3B-Chinese) | **88.79** | 42.06 | 55.21 | 33.93 | **75.57** | **79.5** | **89.4** | 50.25 | **66.67** | **64.53** | + + + +## Dataset + +我们已经定义好了 UniMC 所需的数据格式,你只需要将数据转化为下面的数据格式即可: + +### 文本分类 +```json +{ + "texta": "街头偶遇2018款长安CS35,颜值美炸!或售6万起,还买宝骏510?", + "textb": "", + "question": "下面新闻属于哪一个类别?", + "choice": [ + "房产", + "汽车", + "教育", + "军事" + ], + "answer": "汽车", + "label": 1, + "id": 7759 +} + +``` + +### 情感分析 +```json +{ + "texta": "就是废物,充电不进害得老子把主板烧了,客服不耐烦", + "textb": "", + "question": "", + "choice": ["这是一条差评", "这是一条好评"], + "answer": "这是一条差评", + "label": 0, + "id": 31 +} + 
+``` + +### 语义匹配 +```json +{ + "texta": "不要借了我是试试看能否操作的", + "textb": "", + "question": "", + "choice": ["不能理解为:借款审核期间能否取消借款", "可以理解为:借款审核期间能否取消借款"], + "answer": "不能理解为:借款审核期间能否取消借款", + "label": 0, + "id": 0 +} + +``` + +### 自然语言推理 +```json +{ + "texta": "身上裹一件工厂发的棉大衣,手插在袖筒里", + "textb": "", + "question": "", + "choice": ["不能推断出:身上至少一件衣服", "很难推断出:身上至少一件衣服", "可以推断出:身上至少一件衣服"], + "answer": "可以推断出:身上至少一件衣服", + "label": 2, + "id": 0 +} + +``` + + +## Citation +如果你觉得本仓库帮助到了你,你可以使用下面方式引用我们的工作 + +```text +@article{unimc, + author = {Ping Yang and + Junjie Wang and + Ruyi Gan and + Xinyu Zhu and + Lin Zhang and + Ziwei Wu and + Xinyu Gao and + Jiaxing Zhang and + Tetsuya Sakai}, + title = {Zero-Shot Learners for Natural Language Understanding via a Unified Multiple Choice Perspective}, + journal = {CoRR}, + volume = {abs/2210.08590}, + year = {2022} +} +``` + +## License + +[Apache License 2.0](https://github.com/IDEA-CCNL/Fengshenbang-LM/blob/main/LICENSE) + diff --git a/fengshen/examples/unimc/README_en.md b/fengshen/examples/unimc/README_en.md new file mode 100644 index 0000000000000000000000000000000000000000..0a1e86888c5cfc3046527613f603f96729cdab08 --- /dev/null +++ b/fengshen/examples/unimc/README_en.md @@ -0,0 +1,104 @@ +[**中文**](./README.md) | [**English**](./README_en.md) +# UniMC +Code for [Zero-Shot Learners for Natural Language Understanding via a Unified Multiple Choice Perspective](https://arxiv.org/abs/2210.08590) + + + +![](./unimc.jpg) + +## Update +- [2022-10-18] Release preprint in arXiv. +- [2022-10-14] Release code in GitHub. + +## Requirements + + +```shell +git clone https://github.com/IDEA-CCNL/Fengshenbang-LM.git +cd Fengshenbang-LM +pip install --editable . +``` + +## Quick Start +You can refer to our [example.py]() + +```python +import argparse +from fengshen.pipelines.multiplechoice import UniMCPipelines + +total_parser = argparse.ArgumentParser("TASK NAME") +total_parser = UniMCPipelines.piplines_args(total_parser) +args = total_parser.parse_args() + +pretrained_model_path = 'IDEA-CCNL/Erlangshen-UniMC-Albert-235M-English' +args.language='english' +args.learning_rate=2e-5 +args.max_length=512 +args.max_epochs=3 +args.batchsize=8 +args.default_root_dir='./' +model = UniMCPipelines(args, model_path=pretrained_model_path) + +train_data = [] +dev_data = [] +test_data = [{ + "texta": "it 's just incredibly dull .", + "textb": "", + "question": "What is sentiment of follow review?", + "choice": ["it's great", "it's terrible"], + "answer": "", + "label": 0, + "id": 19 +}] + +if args.train: + model.train(train_data, dev_data) +result = model.predict(test_data) +``` +## Pretrained Model +For the English model, the model was pre-trained with 14 multiplechoice datasets. For the Chinese model, we have collected 48 datasets to pre-train the model, and we have open sourced the pre-trained model to the HuggingFace community. 
+ +| Model | URL | +|:---------:|:--------------:| +| Erlangshen-UniMC-Albert-235-English | [https://huggingface.co./IDEA-CCNL/Erlangshen-UniMC-Albert-235M-English](https://huggingface.co./IDEA-CCNL/Erlangshen-UniMC-Albert-235M-English) | +| Erlangshen-UniMC-RoBERTa-110M-Chinese | [https://huggingface.co./IDEA-CCNL/Erlangshen-UniMC-RoBERTa-110M-Chinese](https://huggingface.co./IDEA-CCNL/Erlangshen-UniMC-RoBERTa-110M-Chinese) | +| Erlangshen-UniMC-RoBERTa-330M-Chinese | [https://huggingface.co./IDEA-CCNL/Erlangshen-UnimC-RoBERTa-330M-Chinese](https://huggingface.co./IDEA-CCNL/Erlangshen-UniMC-RoBERTa-330M-Chinese) | +| Erlangshen-UniMC-MegatronBERT-1.3B-Chinese | [https://huggingface.co./IDEA-CCNL/Erlangshen-UniMC-MegatronBERT-1.3B-Chinese](https://huggingface.co./IDEA-CCNL/Erlangshen-UniMC-MegatronBERT-1.3B-Chinese) | + + +## Experiments +To evaluate the performance of UniMC, we use 14 multiple-choice datasets to pre-train the model with the ability to make choices + +**Zero-shot** +| Model | T0 11B | GLaM 60B | FLAN 137B | PaLM 540B | UniMC 235M | +|---------|--------|----------|-----------|-----------|------------| +| ANLI R1 | 43.6 | 40.9 | 47.7 | 48.4 | **52.0** | +| ANLI R2 | 38.7 | 38.2 | 43.9 | 44.2 | **44.4** | +| ANLI R3 | 41.3 | 40.9 | 47.0 | 45.7 | **47.8** | +| CB | 70.1 | 33.9 | 64.1 | 51.8 | **75.7** | + +## Citation +If this repository helps you, please cite this paper: + +```text +@article{unimc, + author = {Ping Yang and + Junjie Wang and + Ruyi Gan and + Xinyu Zhu and + Lin Zhang and + Ziwei Wu and + Xinyu Gao and + Jiaxing Zhang and + Tetsuya Sakai}, + title = {Zero-Shot Learners for Natural Language Understanding via a Unified Multiple Choice Perspective}, + journal = {CoRR}, + volume = {abs/2210.08590}, + year = {2022} +} +``` + +## License + +[Apache License 2.0](https://github.com/IDEA-CCNL/Fengshenbang-LM/blob/main/LICENSE) + diff --git a/fengshen/examples/unimc/example.py b/fengshen/examples/unimc/example.py new file mode 100644 index 0000000000000000000000000000000000000000..8f6a257ad2438fe2158c6a66cb69b0ce9704e90b --- /dev/null +++ b/fengshen/examples/unimc/example.py @@ -0,0 +1,82 @@ +import argparse +from fengshen.pipelines.multiplechoice import UniMCPipelines + + +def main(): + total_parser = argparse.ArgumentParser("TASK NAME") + total_parser = UniMCPipelines.piplines_args(total_parser) + args = total_parser.parse_args() + + pretrained_model_path = 'IDEA-CCNL/Erlangshen-UniMC-RoBERTa-110M-Chinese' + args.learning_rate = 2e-5 + args.max_length = 512 + args.max_epochs = 3 + args.batchsize = 8 + args.train = 'train' + args.default_root_dir = './' + + model = UniMCPipelines(args, model_path=pretrained_model_path) + + train_data = [ # 训练数据 + { + "texta": "凌云研发的国产两轮电动车怎么样,有什么惊喜?", + "textb": "", + "question": "下面新闻属于哪一个类别?", + "choice": [ + "教育", + "科技", + "军事", + "旅游", + "国际", + "股票", + "农业", + "电竞" + ], + "answer": "科技", + "label": 1, + "id": 0 + } + ] + dev_data = [ # 验证数据 + { + "texta": "我四千一个月,老婆一千五一个月,存款八万且有两小孩,是先买房还是先买车?", + "textb": "", + "question": "下面新闻属于哪一个类别?", + "choice": [ + "故事", + "文化", + "娱乐", + "体育", + "财经", + "房产", + "汽车" + ], + "answer": "汽车", + "label": 6, + "id": 0 + } + ] + test_data = [ # 测试数据 + {"texta": "街头偶遇2018款长安CS35,颜值美炸!或售6万起,还买宝骏510?", + "textb": "", + "question": "下面新闻属于哪一个类别?", + "choice": [ + "房产", + "汽车", + "教育", + "军事" + ], + "answer": "汽车", + "label": 1, + "id": 7759} + ] + + if args.train: + model.train(train_data, dev_data) + result = model.predict(test_data) + for line in result: + print(line) + + +if __name__ == 
"__main__": + main() diff --git a/fengshen/examples/unimc/unimc.jpg b/fengshen/examples/unimc/unimc.jpg new file mode 100644 index 0000000000000000000000000000000000000000..53715d9ac87d78b8d6cbcf65f7c8190a6e0fae05 Binary files /dev/null and b/fengshen/examples/unimc/unimc.jpg differ diff --git a/fengshen/examples/wenzhong_qa/README.md b/fengshen/examples/wenzhong_qa/README.md new file mode 100644 index 0000000000000000000000000000000000000000..8b424909f39c5b1480fbc5cc7015e82714292930 --- /dev/null +++ b/fengshen/examples/wenzhong_qa/README.md @@ -0,0 +1,75 @@ +#
yuyuanQA模型finetune +本示例主要实现了基于GPT2结构的Yuyuan医疗大模型,通过医疗问答对Finetune,使大模型能够有closebook-qa的能力。 +### 数据和模型 +#### 模型: +finetune的模型是yuyuan模型,余元模型是GPT2的结构,在预训练阶段主要是用PubMed医疗相关的数据集进行的预训练。是一个医疗领域的大模型。模型共有35亿参数,主要参数如下表所示: + +| 配置 | 参数 | +| :---------: | :---: | +| nlayers | 30 | +| nheaders | 32 | +| hidden-size | 3072 | +| seq-length | 1024 | + +预训练的数据,主要医疗相关的论文、杂志期刊等,以英文语料为主。 +#### 数据: +用于finetune的语料是清洗于[MedQuAD](https://github.com/abachaa/MedQuAD)数据集,清洗完成后是下面的格式: +```text +...... +{'question':'.........','answer':'........'} +{'question':'.........','answer':'........'} +...... +``` +### finetune框架以及参数配置 +#### 框架 : +finetune的框架是IDEA研究院CCNL小组整合各大框架的优点开源的[封神框架](https://github.com/IDEA-CCNL/Fengshenbang-LM/tree/main/fengshen),具体代码可以参考[finetune_medicalQA.py](https://github.com/IDEA-CCNL/Fengshenbang-LM/blob/dev_wzw/fengshen/examples/wenzhong_qa/finetune_medicalQA.py)和[medicalQADataset.py](https://github.com/IDEA-CCNL/Fengshenbang-LM/blob/dev_wzw/fengshen/data/task_dataloader/medicalQADataset.py)。 +#### 训练参数: +训练参数,我们采用了deepspeed相关的配置,用2个集群的节点共16张A100,在很短的时间内完成了finetune。具体参数配置可以参考[finetune_GPT2_medicalQA.sh](https://github.com/IDEA-CCNL/Fengshenbang-LM/blob/dev_wzw/fengshen/examples/wenzhong_qa/finetune_GPT2_medicalQA.sh) +### finetune后的效果以及使用 +#### 效果对比: +finetune后的模型,用100对问答对,基于BLEU分与之前用Magetron框架训练的模型进行了简单的对比,效果比较接近。 + +unsmoth method: +| 框架 | 1-gram | 2-gram | 3-gram | 4-gram | +| -------- | ------------------ | ------------------ | ------------------ | ------------------- | +| Fengshen | 0.5241376169070796 | 0.5215762466122144 | 0.4894353584800885 | 0.44840139357073466 | +| Magetron | 0.5321340489166898 | 0.5110257474778213 | 0.4703745962926368 | 0.4310875933354554 | + +smoth method: +| 框架 | 1-gram | 2-gram | 3-gram | 4-gram | +| -------- | ----------------- | ------------------ | ------------------ | ------------------ | +| Fengshen | 0.717829796617609 | 0.6516910802858905 | 0.5859726677095979 | 0.525510691686505 | +| Magetron | 0.776190980974117 | 0.6749801211321476 | 0.5897846253142169 | 0.5230773076722481 | +#### 使用方式: +支持直接用Haggingface或者pytorch-lightning框架调用。由于在finetune的时候,加入了prompt,在问答的时候,输入应该是:" +`Question:your question about medical? answer:`",接着模型就回以续写的方式回答你的问题。用huggingface的调用代码可以参考下面的代码: +```python +from transformers import GPT2Tokenizer,GPT2LMHeadModel +model_path = 'pretrained_model_hf/yuyuanQA-v1' # input your own model file path +model = GPT2LMHeadModel.from_pretrained(model_path) +tokenizer = GPT2Tokenizer.from_pretrained(model_path) +model = model.cuda(6) # move your model to the GPU +model.eval() # just do predict + +def answering(question): +# question = "What should gout patients pay attention to in diet?" 
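+    # the model was finetuned with a prompt, so the input must follow the
+    # "Question:{question} answer:" template; the model then answers by continuing the text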
+ inputs = tokenizer(f'Question:{question} answer:',return_tensors='pt').input_ids.to(model.device) + + generation_output = model.generate(input_ids = inputs, + return_dict_in_generate=True, + output_scores=True, + max_length=150, + # max_new_tokens=80, + do_sample=True, + top_p = 0.9, + eos_token_id=50256, + pad_token_id=0, + num_return_sequences = 5) + answers = [] + for idx,sentence in enumerate(generation_output.sequences): + next_sentence = tokenizer.decode(sentence).split('<|endoftext|>')[0] + answer = next_sentence.split(sep='answer:',maxsplit=1)[1] + answers.append(answer) + return answers +answering('your question?') +``` \ No newline at end of file diff --git a/fengshen/examples/wenzhong_qa/finetune_GPT2_medicalQA.sh b/fengshen/examples/wenzhong_qa/finetune_GPT2_medicalQA.sh new file mode 100644 index 0000000000000000000000000000000000000000..d9a81670ed121ecfb9fa3e0e546f0773374087af --- /dev/null +++ b/fengshen/examples/wenzhong_qa/finetune_GPT2_medicalQA.sh @@ -0,0 +1,123 @@ +#!/bin/bash +#SBATCH --job-name=medical_qa_finetune +#SBATCH --nodes=2 +#SBATCH --ntasks-per-node=8 +#SBATCH --gres=gpu:8 # number of gpus +#SBATCH -o /cognitive_comp/wuziwei/task/fs_medical_qa_finetune/%x-%j.log +#SBATCH -e /cognitive_comp/wuziwei/task/fs_medical_qa_finetune/%x-%j.err +#SBATCH -x dgx[050,049] + +#export NCCL_DEBUG=INFO + +# export PATH=$PATH:/cognitive_comp/wuziwei/codes/fengshen/fengshen +set -x -e + +echo "START TIME: $(date)" +MICRO_BATCH_SIZE=1 +ROOT_DIR=/cognitive_comp/wuziwei/task/fs_medical_qa_finetune + +ZERO_STAGE=2 + +config_json="$ROOT_DIR/training_config.json" +export MASTER_PORT=$[RANDOM%10000+30000] + +# Deepspeed figures out GAS dynamically from dynamic GBS via set_train_batch_size() +cat < $config_json +{ + "zero_optimization": { + "stage": $ZERO_STAGE, + "contiguous_gradients": true, + "overlap_comm": true, + "reduce_scatter": true, + "reduce_bucket_size": 2e8, + "allgather_bucket_size": 2e8 + }, + "optimizer": { + "type": "Adam", + "params": { + "lr": 1e-5, + "betas": [0.9,0.95], + "eps": 1e-8, + "weight_decay": 1e-2 + } + }, + "scheduler": { + "type": "WarmupLR", + "params":{ + "warmup_min_lr": 5e-6, + "warmup_max_lr": 1e-5 + } + }, + "fp16": { + "enabled": true, + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 32, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "activation_checkpointing": { + "partition_activations": false, + "contiguous_memory_optimization": false + }, + "wall_clock_breakdown": false, + "zero_allow_untested_optimizer": false, + "train_micro_batch_size_per_gpu": 1, + "steps_per_print": 100, + "gradient_clipping": 1.0 +} +EOT + +# export PL_DEEPSPEED_CONFIG_PATH=$config_json +export PL_DEEPSPEED_CONFIG_PATH=$config_json +export TORCH_EXTENSIONS_DIR=/cognitive_comp/wuziwei/torch_extendsions +TRAINER_ARGS=" + --max_epochs 10 \ + --gpus 16 \ + --num_nodes 2 \ + --strategy deepspeed_stage_2 \ + --default_root_dir $ROOT_DIR \ + --dirpath $ROOT_DIR/ckpt \ + --save_top_k 3 \ + --monitor train_loss \ + --mode min \ + --save_last \ +" +DATA_DIR=/cognitive_comp/wuziwei/task-data/medical_qa +DATA_ARGS=" + --data_dir $DATA_DIR \ + --train_batchsize $MICRO_BATCH_SIZE \ + --valid_batchsize $MICRO_BATCH_SIZE \ + --train_data train.txt \ + --valid_data valid.txt \ + --test_data test.txt +" + +# PRETRAINED_MODEL_PATH=/cognitive_comp/wuziwei/pretrained_model_hf/gpt2 +PRETRAINED_MODEL_PATH=/cognitive_comp/wuziwei/pretrained_model_hf/medical_v2 +MODEL_ARGS=" + --pretrained_model_path ${PRETRAINED_MODEL_PATH} \ + --output_save_path 
$ROOT_DIR/predict.json \ + --learning_rate 1e-4 \ + --weight_decay 0.1 \ + --warmup 0.01 \ +" + +SCRIPTS_PATH=/cognitive_comp/wuziwei/codes/fengshen/fengshen/examples/GPT_pretrain_finetune/finetune_medicalQA.py + +export CMD=" \ + $SCRIPTS_PATH \ + $TRAINER_ARGS \ + $MODEL_ARGS \ + $DATA_ARGS \ + " + +echo $CMD + +SINGULARITY_PATH=/cognitive_comp/wuziwei/container/oneflow-cuda11.sif +# singularity exec --nv -B /cognitive_comp/wuziwei/:/cognitive_comp/wuziwei/ $SINGULARITY_PATH python $CMD + +# to debug - add echo (it exits and prints what it would have launched) +#run_cmd="$PY_LAUNCHER $CMD" + +srun singularity exec --nv -B /cognitive_comp/wuziwei/:/cognitive_comp/wuziwei/ $SINGULARITY_PATH bash -c 'python $CMD' diff --git a/fengshen/examples/wenzhong_qa/finetune_medicalQA.py b/fengshen/examples/wenzhong_qa/finetune_medicalQA.py new file mode 100644 index 0000000000000000000000000000000000000000..1a79948d5f7fe736856e44392a834edfa6ac51d9 --- /dev/null +++ b/fengshen/examples/wenzhong_qa/finetune_medicalQA.py @@ -0,0 +1,176 @@ +from transformers import GPT2LMHeadModel +from data.task_dataloader.medicalQADataset import GPT2QADataModel +from transformers.optimization import get_linear_schedule_with_warmup +from pytorch_lightning import Trainer, loggers +from pytorch_lightning.callbacks import ModelCheckpoint +import pytorch_lightning as pl +import argparse +import torch +import os +import sys +sys.path.insert(0, '/cognitive_comp/wuziwei/codes/fengshen/fengshen') +# sys.path.append('../../') +# sys.path.append('../') +# os.environ["CUDA_VISIBLE_DEVICES"] = '4,5,6,7' + + +class GPT2FinetuneMedicalQAModelCheckpoint: + @staticmethod + def add_argparse_args(parent_args): + parser = parent_args.add_argument_group('BaseModel') + + parser.add_argument('--monitor', default='train_loss', type=str) + parser.add_argument('--mode', default='min', type=str) + parser.add_argument('--dirpath', default='./ckpt/', type=str) + parser.add_argument( + '--filename', default='model-{epoch:02d}-{train_loss:.4f}', type=str) + parser.add_argument('--save_last', action='store_true', default=True) + parser.add_argument('--save_top_k', default=3, type=float) + parser.add_argument('--every_n_train_steps', default=1000, type=float) + parser.add_argument('--save_weights_only', default=True, type=bool) + + return parent_args + + def __init__(self, args): + self.callbacks = ModelCheckpoint(monitor=args.monitor, + save_top_k=args.save_top_k, + mode=args.mode, + # every_n_train_steps=args.every_n_train_steps, + save_weights_only=args.save_weights_only, + dirpath=args.dirpath, + filename=args.filename, + save_last=args.save_last) + + +class GPT2FinetuneMedicalQA(pl.LightningModule): + + @staticmethod + def add_model_specific_args(parent_args): + parser = parent_args.add_argument_group('BaseModel') + parser.add_argument('--learning_rate', default=1e-4, type=float) + parser.add_argument('--weight_decay', default=0.1, type=float) + parser.add_argument('--warmup', default=0.01, type=float) + return parent_args + + def __init__(self, args, num_data): + super().__init__() + self.args = args + self.num_data = num_data + print('num_data:', num_data) + self.model = GPT2LMHeadModel.from_pretrained( + args.pretrained_model_path) + + def setup(self, stage) -> None: + if stage == 'fit': + num_gpus = self.trainer.gpus if self.trainer.gpus is not None else 0 + self.total_step = int(self.trainer.max_epochs * self.num_data / + (max(1, num_gpus) * self.trainer.accumulate_grad_batches)) + print('Total training step:', self.total_step) + + def 
training_step(self, batch, batch_idx): + output = self.model(input_ids=batch['input_ids'], + attention_mask=batch['attention_mask'], labels=batch['labels']) + # output = self.model(input_ids=batch['input_ids'], labels=batch['labels']) + # acc = self.comput_metrix(output.logits, batch['labels']) + self.log('train_loss', output.loss) + return output.loss + + def comput_metrix(self, logits, labels): + y_pred = torch.argmax(logits, dim=-1) + y_pred = y_pred.view(size=(-1,)) + y_true = labels.view(size=(-1,)).float() + corr = torch.eq(y_pred, y_true) + acc = torch.sum(corr.float())/labels.size()[0] + return acc + + def validation_step(self, batch, batch_idx): + output = self.model(input_ids=batch['input_ids'], + attention_mask=batch['attention_mask'], labels=batch['labels']) + # output = self.model(input_ids=batch['input_ids'], labels=batch['labels']) + # acc = self.comput_metrix(output.logits, batch['labels']) + self.log('val_loss', output.loss) + # self.log('val_acc', acc) + + def configure_optimizers(self): + no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] + paras = list( + filter(lambda p: p[1].requires_grad, self.named_parameters())) + paras = [{ + 'params': + [p for n, p in paras if not any(nd in n for nd in no_decay)], + 'weight_decay': self.args.weight_decay + }, { + 'params': [p for n, p in paras if any(nd in n for nd in no_decay)], + 'weight_decay': 0.0 + }] + optimizer = torch.optim.AdamW(paras, lr=self.args.learning_rate) + scheduler = get_linear_schedule_with_warmup( + optimizer, int(self.total_step * self.args.warmup), + self.total_step) + + return [{ + 'optimizer': optimizer, + 'lr_scheduler': { + 'scheduler': scheduler, + 'interval': 'step', + 'frequency': 1 + } + }] + + +def main(): + total_parser = argparse.ArgumentParser("Summary Task") + total_parser.add_argument( + '--do_eval_only', action='store_true', default=False) + total_parser.add_argument( + '--pretrained_model_path', default=None, type=str) + total_parser.add_argument('--output_save_path', + default='./predict.json', type=str) + # * Args for data preprocessing + total_parser = GPT2QADataModel.add_data_specific_args(total_parser) + # * Args for training + total_parser = Trainer.add_argparse_args(total_parser) + total_parser = GPT2FinetuneMedicalQAModelCheckpoint.add_argparse_args( + total_parser) + total_parser = GPT2FinetuneMedicalQA.add_model_specific_args(total_parser) + # * Args for base model + args = total_parser.parse_args() + + data_model = GPT2QADataModel(args) + if not args.do_eval_only: + model = GPT2FinetuneMedicalQA(args, len(data_model.train_dataloader())) + checkpoint_callback = GPT2FinetuneMedicalQAModelCheckpoint( + args).callbacks + logger = loggers.TensorBoardLogger(save_dir=os.path.join( + args.default_root_dir, 'log/'), name='MedicalQA-GPT2') + trainer = Trainer.from_argparse_args(args, + logger=logger, + callbacks=[checkpoint_callback] + ) + trainer.fit(model, data_model) + + # result = trainer.predict(model, data_model) + # with open('test_results.txt', 'wt', encoding='utf-8') as w: + # for line in result: + # w.writelines(line) + + model.model.save_pretrained( + '/cognitive_comp/wuziwei/pretrained_model_hf') + else: + print('save to hf.....') + trainer = Trainer.from_argparse_args(args) + model = GPT2FinetuneMedicalQA( + args, len(data_model.predict_dataloader())) + + result = trainer.predict( + model, data_model, ckpt_path='/cognitive_comp/wuziwei/task/fs_medical_qa_finetune/ckpt/last.ckpt') + # with open('test_results.txt','wt',encoding='utf-8') as w: + # for line in result: + # 
w.writelines(line) + + model.model.save_pretrained( + '/cognitive_comp/wuziwei/pretrained_model_hf') + + +if __name__ == '__main__': + main() diff --git a/fengshen/examples/wenzhong_qa/finetune_wenzhong.py b/fengshen/examples/wenzhong_qa/finetune_wenzhong.py new file mode 100644 index 0000000000000000000000000000000000000000..bcdeda71fd2d2d70dd56148451ddf2d4946bf31c --- /dev/null +++ b/fengshen/examples/wenzhong_qa/finetune_wenzhong.py @@ -0,0 +1,153 @@ +# sys.path.append('./') +import os +import torch +import argparse +import pytorch_lightning as pl +from pytorch_lightning.callbacks import ModelCheckpoint +from pytorch_lightning import Trainer, loggers +from transformers.optimization import get_linear_schedule_with_warmup +from transformers import GPT2LMHeadModel +from fengshen.data.task_dataloader.medicalQADataset import GPT2QADataModel + + +class GPT2FinetuneMedicalQAModelCheckpoint: + @staticmethod + def add_argparse_args(parent_args): + parser = parent_args.add_argument_group('BaseModel') + + parser.add_argument('--monitor', default='train_loss', type=str) + parser.add_argument('--mode', default='min', type=str) + parser.add_argument('--dirpath', default='./ckpt/', type=str) + parser.add_argument( + '--filename', default='model-{epoch:02d}-{train_loss:.4f}', type=str) + parser.add_argument('--save_last', action='store_true', default=True) + parser.add_argument('--save_top_k', default=3, type=float) + parser.add_argument('--every_n_train_steps', default=100, type=float) + parser.add_argument('--save_weights_only', default=True, type=bool) + + return parent_args + + def __init__(self, args): + self.callbacks = ModelCheckpoint(monitor=args.monitor, + save_top_k=args.save_top_k, + mode=args.mode, + every_n_train_steps=args.every_n_train_steps, + save_weights_only=args.save_weights_only, + dirpath=args.dirpath, + filename=args.filename, + save_last=args.save_last) + + +class GPT2FinetuneMedicalQA(pl.LightningModule): + + @staticmethod + def add_model_specific_args(parent_args): + parser = parent_args.add_argument_group('BaseModel') + parser.add_argument('--learning_rate', default=1e-4, type=float) + parser.add_argument('--weight_decay', default=0.1, type=float) + parser.add_argument('--warmup', default=0.01, type=float) + return parent_args + + def __init__(self, args, num_data): + super().__init__() + self.args = args + self.num_data = num_data + print('num_data:', num_data) + self.model = GPT2LMHeadModel.from_pretrained(args.pretrained_model_path) + + def setup(self, stage) -> None: + if stage == 'fit': + num_gpus = self.trainer.gpus if self.trainer.gpus is not None else 0 + self.total_step = int(self.trainer.max_epochs * self.num_data + / (max(1, num_gpus) * self.trainer.accumulate_grad_batches)) + print('Total training step:', self.total_step) + + def training_step(self, batch, batch_idx): + output = self.model( + input_ids=batch['input_ids'], attention_mask=batch['attention_mask'], labels=batch['labels']) + # output = self.model(input_ids=batch['input_ids'], labels=batch['labels']) + # acc = self.comput_metrix(output.logits, batch['labels']) + self.log('train_loss', output.loss) + return output.loss + + def comput_metrix(self, logits, labels): + y_pred = torch.argmax(logits, dim=-1) + y_pred = y_pred.view(size=(-1,)) + y_true = labels.view(size=(-1,)).float() + corr = torch.eq(y_pred, y_true) + acc = torch.sum(corr.float()) / labels.size()[0] + return acc + + def validation_step(self, batch, batch_idx): + output = self.model( + input_ids=batch['input_ids'], 
attention_mask=batch['attention_mask'], labels=batch['labels']) + # output = self.model(input_ids=batch['input_ids'], labels=batch['labels']) + # acc = self.comput_metrix(output.logits, batch['labels']) + self.log('val_loss', output.loss) + # self.log('val_acc', acc) + + def configure_optimizers(self): + no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] + paras = list( + filter(lambda p: p[1].requires_grad, self.named_parameters())) + paras = [{ + 'params': + [p for n, p in paras if not any(nd in n for nd in no_decay)], + 'weight_decay': self.args.weight_decay + }, { + 'params': [p for n, p in paras if any(nd in n for nd in no_decay)], + 'weight_decay': 0.0 + }] + optimizer = torch.optim.AdamW(paras, lr=self.args.learning_rate) + scheduler = get_linear_schedule_with_warmup( + optimizer, int(self.total_step * self.args.warmup), + self.total_step) + + return [{ + 'optimizer': optimizer, + 'lr_scheduler': { + 'scheduler': scheduler, + 'interval': 'step', + 'frequency': 1 + } + }] + + +def main(): + total_parser = argparse.ArgumentParser("QA Task") + total_parser.add_argument('--do_eval_only', action='store_true', default=False) + total_parser.add_argument('--pretrained_model_path', default='google/mt5-small', type=str) + total_parser.add_argument('--output_save_path', default='./predict.json', type=str) + # * Args for data preprocessing + total_parser = GPT2QADataModel.add_data_specific_args(total_parser) + # * Args for training + total_parser = Trainer.add_argparse_args(total_parser) + total_parser = GPT2FinetuneMedicalQAModelCheckpoint.add_argparse_args(total_parser) + total_parser = GPT2FinetuneMedicalQA.add_model_specific_args(total_parser) + # * Args for base model + args = total_parser.parse_args() + + data_model = GPT2QADataModel(args) + if not args.do_eval_only: + model = GPT2FinetuneMedicalQA(args, len(data_model.train_dataloader())) + checkpoint_callback = GPT2FinetuneMedicalQAModelCheckpoint(args).callbacks + logger = loggers.TensorBoardLogger(save_dir=os.path.join( + args.default_root_dir, 'log/'), name='WenZhong') + trainer = Trainer.from_argparse_args(args, + logger=logger, + callbacks=[checkpoint_callback] + ) + trainer.fit(model, data_model) + + +if __name__ == '__main__': + main() + # test() + +''' +# python examples/mt5_summary.py --gpus=1 --test_data=test_public.jsonl +# --default_root_dir=/cognitive_comp/ganruyi/fengshen/mt5_summary/eval +# --do_eval_only +# --resume_from_checkpoint=/cognitive_comp/ganruyi/fengshen/mt5_summary/ckpt/model-epoch=01-train_loss=1.9166.ckpt +# --strategy=ddp +''' diff --git a/fengshen/examples/wenzhong_qa/finetune_wenzhong.sh b/fengshen/examples/wenzhong_qa/finetune_wenzhong.sh new file mode 100644 index 0000000000000000000000000000000000000000..0100377bf5c54c0eba3088e3b09368a5b31f9c06 --- /dev/null +++ b/fengshen/examples/wenzhong_qa/finetune_wenzhong.sh @@ -0,0 +1,126 @@ +#!/bin/bash +#SBATCH --job-name=finetune_wenzhong +#SBATCH --cpus-per-task=50 +#SBATCH --nodes=1 +#SBATCH --ntasks-per-node=1 +#SBATCH --gres=gpu:1 # number of gpus +#SBATCH -o %x-%j.log +#SBATCH -e %x-%j.err + +set -x -e + +export MASTER_PORT=$[RANDOM%10000+50000] +export TORCH_EXTENSIONS_DIR=/cognitive_comp/gaoxinyu/torch_extendsions + +echo "START TIME: $(date)" +MICRO_BATCH_SIZE=1 +ROOT_DIR=/cognitive_comp/gaoxinyu/FS/fengshen/fengshen + +ZERO_STAGE=3 + +config_json="$ROOT_DIR/ds_config.$SLURM_JOBID.json" +#config_json="$ROOT_DIR/ds_config.wzw.json" +# Deepspeed figures out GAS dynamically from dynamic GBS via set_train_batch_size() +cat < $config_json +{ + 
"train_micro_batch_size_per_gpu":1, + "steps_per_print":100, + "gradient_clipping":1, + "zero_optimization":{ + "stage": $ZERO_STAGE, + "offload_optimizer":{ + "device":"cpu", + "pin_memory":true + }, + "offload_param":{ + "device":"cpu", + "pin_memory":true + }, + "overlap_comm":true, + "contiguous_gradients":true, + "sub_group_size":1000000000, + "stage3_max_live_parameters":1000000000, + "stage3_max_reuse_distance":1000000000, + "stage3_gather_fp16_weights_on_model_save":true + }, + "optimizer":{ + "type":"Adam", + "params":{ + "lr": 1e-5, + "weight_decay":0.01 + } + }, + "scheduler":{ + "type":"WarmupLR", + "params":{ + "warmup_min_lr":5e-6, + "warmup_max_lr":1e-5 + } + }, + "zero_allow_untested_optimizer":false, + "fp16":{ + "enabled":true, + "loss_scale":0, + "loss_scale_window":1000, + "hysteresis":2, + "min_loss_scale":1 + }, + "activation_checkpointing":{ + "partition_activations":false, + "contiguous_memory_optimization":false + }, + "wall_clock_breakdown":false +} +EOT + +export PL_DEEPSPEED_CONFIG_PATH=$config_json + +TRAINER_ARGS=" + --max_epochs 2 \ + --gpus 1 \ + --num_nodes 1 \ + --strategy deepspeed_stage_3 \ + --precision 16 \ + --default_root_dir $ROOT_DIR \ + --dirpath $ROOT_DIR/ckpt \ + --save_top_k 3 \ + --monitor train_loss \ + --mode min \ + --save_last \ +" +DATA_DIR=/cognitive_comp/gaoxinyu/data/yuyuan +DATA_ARGS=" + --data_dir $DATA_DIR \ + --train_batchsize $MICRO_BATCH_SIZE \ + --valid_batchsize $MICRO_BATCH_SIZE \ + --train_data train.txt \ + --valid_data valid.txt \ + --test_data test.txt +" + +MODEL_ARGS=" + --pretrained_model_path /cognitive_comp/gaoxinyu/hf_model/wenzhong \ + --output_save_path $ROOT_DIR/predict.json \ + --learning_rate 1e-4 \ + --weight_decay 0.1 \ + --warmup 0.01 \ +" + +SCRIPTS_PATH=/cognitive_comp/gaoxinyu/FS/fengshen/finetune_wenzhong.py + +export CMD=" \ + $SCRIPTS_PATH \ + $TRAINER_ARGS \ + $MODEL_ARGS \ + $DATA_ARGS \ + " + +echo $CMD + +SINGULARITY_PATH=/cognitive_comp/gaoxinyu/docker/pytorch21_06_py3_docker_image_v2.sif + +# to debug - add echo (it exits and prints what it would have launched) +#run_cmd="$PY_LAUNCHER $CMD" + +clear; srun --jobid $SLURM_JOBID singularity exec --nv -B /cognitive_comp/:/cognitive_comp/ $SINGULARITY_PATH bash -c 'python $CMD' +# bash -c 'python $CMD' \ No newline at end of file diff --git a/fengshen/examples/zen1_finetune/fengshen_sequence_level_ft_task.py b/fengshen/examples/zen1_finetune/fengshen_sequence_level_ft_task.py new file mode 100644 index 0000000000000000000000000000000000000000..1404571159ea95776c3953fdecb28a84031c1347 --- /dev/null +++ b/fengshen/examples/zen1_finetune/fengshen_sequence_level_ft_task.py @@ -0,0 +1,610 @@ +# coding=utf-8 +# Copyright 2021 The IDEA Authors. All rights reserved. + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at + +# http://www.apache.org/licenses/LICENSE-2.0 + +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+from fengshen.models.zen1.tokenization import BertTokenizer +from fengshen.models.zen1.modeling import ZenForSequenceClassification +from fengshen.models.zen1.ngram_utils import ZenNgramDict +from pytorch_lightning.callbacks import LearningRateMonitor +import csv +from dataclasses import dataclass +import logging +import math +import numpy as np +import os +from tqdm import tqdm +import json +import torch +import pytorch_lightning as pl +from random import shuffle +import argparse +from pytorch_lightning.callbacks import ModelCheckpoint +from torch.utils.data import Dataset, DataLoader + +logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s', + datefmt='%m/%d/%Y %H:%M:%S', + level=logging.INFO) +logger = logging.getLogger(__name__) + + +class InputExample(object): + """A single training/test example for simple sequence classification.""" + + def __init__(self, guid, text_a, text_b=None, label=None): + """Constructs a InputExample. + + Args: + guid: Unique id for the example. + text_a: string. The untokenized text of the first sequence. For single + sequence tasks, only this sequence must be specified. + text_b: (Optional) string. The untokenized text of the second sequence. + Only must be specified for sequence pair tasks. + label: (Optional) string. The label of the example. This should be + specified for train and dev examples, but not for test examples. + """ + self.guid = guid + self.text_a = text_a + self.text_b = text_b + self.label = label + + +class InputFeatures(object): + """A single set of features of data.""" + + def __init__(self, input_ids, input_mask, segment_ids, label_id, ngram_ids, ngram_positions, ngram_lengths, + ngram_tuples, ngram_seg_ids, ngram_masks): + self.input_ids = input_ids + self.input_mask = input_mask + self.segment_ids = segment_ids + self.label_id = label_id + + self.ngram_ids = ngram_ids + self.ngram_positions = ngram_positions + self.ngram_lengths = ngram_lengths + self.ngram_tuples = ngram_tuples + self.ngram_seg_ids = ngram_seg_ids + self.ngram_masks = ngram_masks + + +class DataProcessor(object): + """Base class for data converters for sequence classification data sets.""" + + def get_examples(self, data_path, mode): + """Gets a collection of `InputExample`s for the train set.""" + raise NotImplementedError() + + @classmethod + def _read_tsv(cls, input_file, quotechar=None): + """Reads a tab separated value file.""" + with open(input_file, "r") as f: + reader = csv.reader(f, delimiter="\t", quotechar=quotechar) + lines = [] + for line in reader: + # if sys.version_info[0] == 2: + # line = list(unicode(cell, 'utf-8') for cell in line) + lines.append(line) + return lines + + @classmethod + def _read_json(cls, input_file): + """Reads a jsonl file.""" + with open(input_file, "r", encoding="utf-8") as f: + lines = f.readlines() + samples = [] + for line in tqdm(lines): + data = json.loads(line) + samples.append(data) + return samples + + +class TnewsProcessor(DataProcessor): + """Processor for the tnews data set (HIT version).""" + + def get_train_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_json(os.path.join(data_dir, "train.json")), "train") + + def get_examples(self, data_path, mode): + return self._create_examples( + self._read_json(data_path), + set_type=mode + ) + + def _create_examples(self, lines, set_type): + """Creates examples for the training and dev sets.""" + examples = [] + for (i, line) in enumerate(lines): + # if i == 0: + # continue + guid = "%s-%s" % 
(set_type, i) + # text_a = line[0] + text_a = line['sentence'] + label = line['label'] if 'label' in line.keys() else None + examples.append( + InputExample(guid=guid, text_a=text_a, label=label)) + return examples + + +class OcnliProcessor(DataProcessor): + """Processor for the ocnli or cmnli data set (HIT version).""" + + def get_examples(self, data_path, mode): + return self._create_examples( + self._read_json(data_path), + set_type=mode + ) + + def _create_examples(self, lines, set_type): + """Creates examples for the training and dev sets.""" + examples = [] + for (i, line) in enumerate(lines): + # if i == 0: + # continue + guid = "%s-%s" % (set_type, i) + # text_a = line[0] + text_a = line['sentence1'] + text_b = line['sentence2'] + label = line['label'] if 'label' in line.keys() else None + # 特殊处理,cmnli有label为-的 + if label == '-': + label = None + examples.append( + InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) + return examples + + +class IflytekProcessor(DataProcessor): + """Processor for the iflytek data set (HIT version).""" + + def get_examples(self, data_path, mode): + return self._create_examples( + self._read_json(data_path), + set_type=mode + ) + + def _create_examples(self, lines, set_type): + """Creates examples for the training and dev sets.""" + examples = [] + for (i, line) in enumerate(lines): + # if i == 0: + # continue + guid = "%s-%s" % (set_type, i) + # text_a = line[0] + text_a = line['sentence'] + label = line['label'] if 'label' in line.keys() else None + examples.append( + InputExample(guid=guid, text_a=text_a, label=label)) + return examples + + +def convert_examples_to_features(examples, label_map, max_seq_length, tokenizer, ngram_dict): + """Loads a data file into a list of `InputBatch`s.""" + + features = [] + for (ex_index, example) in enumerate(examples): + if ex_index % 10000 == 0: + logger.info("Writing example %d of %d" % (ex_index, len(examples))) + + tokens_a = tokenizer.tokenize(example.text_a) + + tokens_b = None + if example.text_b: + tokens_b = tokenizer.tokenize(example.text_b) + # Modifies `tokens_a` and `tokens_b` in place so that the total + # length is less than the specified length. + # Account for [CLS], [SEP], [SEP] with "- 3" + _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3) + else: + # Account for [CLS] and [SEP] with "- 2" + if len(tokens_a) > max_seq_length - 2: + tokens_a = tokens_a[:(max_seq_length - 2)] + + # The convention in BERT is: + # (a) For sequence pairs: + # tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP] + # type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1 + # (b) For single sequences: + # tokens: [CLS] the dog is hairy . [SEP] + # type_ids: 0 0 0 0 0 0 0 + # + # Where "type_ids" are used to indicate whether this is the first + # sequence or the second sequence. The embedding vectors for `type=0` and + # `type=1` were learned during pre-training and are added to the wordpiece + # embedding vector (and position vector). This is not *strictly* necessary + # since the [SEP] token unambiguously separates the sequences, but it makes + # it easier for the model to learn the concept of sequences. + # + # For classification tasks, the first vector (corresponding to [CLS]) is + # used as as the "sentence vector". Note that this only makes sense because + # the entire model is fine-tuned. 
+ tokens = ["[CLS]"] + tokens_a + ["[SEP]"] + segment_ids = [0] * len(tokens) + + if tokens_b: + tokens += tokens_b + ["[SEP]"] + segment_ids += [1] * (len(tokens_b) + 1) + + input_ids = tokenizer.convert_tokens_to_ids(tokens) + + # The mask has 1 for real tokens and 0 for padding tokens. Only real + # tokens are attended to. + input_mask = [1] * len(input_ids) + + # Zero-pad up to the sequence length. + padding = [0] * (max_seq_length - len(input_ids)) + input_ids += padding + input_mask += padding + segment_ids += padding + + assert len(input_ids) == max_seq_length + assert len(input_mask) == max_seq_length + assert len(segment_ids) == max_seq_length + + label_id = label_map[example.label] + + # ----------- code for ngram BEGIN----------- + ngram_matches = [] + # Filter the word segment from 2 to 7 to check whether there is a word + for p in range(2, 8): + for q in range(0, len(tokens) - p + 1): + character_segment = tokens[q:q + p] + # j is the starting position of the word + # i is the length of the current word + character_segment = tuple(character_segment) + if character_segment in ngram_dict.ngram_to_id_dict: + ngram_index = ngram_dict.ngram_to_id_dict[character_segment] + ngram_matches.append([ngram_index, q, p, character_segment]) + + shuffle(ngram_matches) + # max_word_in_seq_proportion = max_word_in_seq + max_word_in_seq_proportion = math.ceil((len(tokens) / max_seq_length) * ngram_dict.max_ngram_in_seq) + if len(ngram_matches) > max_word_in_seq_proportion: + ngram_matches = ngram_matches[:max_word_in_seq_proportion] + ngram_ids = [ngram[0] for ngram in ngram_matches] + ngram_positions = [ngram[1] for ngram in ngram_matches] + ngram_lengths = [ngram[2] for ngram in ngram_matches] + ngram_tuples = [ngram[3] for ngram in ngram_matches] + ngram_seg_ids = [0 if position < (len(tokens_a) + 2) else 1 for position in ngram_positions] + + ngram_mask_array = np.zeros(ngram_dict.max_ngram_in_seq, dtype=np.bool) + ngram_mask_array[:len(ngram_ids)] = 1 + + # record the masked positions + ngram_positions_matrix = np.zeros(shape=(max_seq_length, ngram_dict.max_ngram_in_seq), dtype=np.int32) + for i in range(len(ngram_ids)): + ngram_positions_matrix[ngram_positions[i]:ngram_positions[i] + ngram_lengths[i], i] = 1.0 + + # Zero-pad up to the max word in seq length. + padding = [0] * (ngram_dict.max_ngram_in_seq - len(ngram_ids)) + ngram_ids += padding + ngram_lengths += padding + ngram_seg_ids += padding + + # ----------- code for ngram END----------- + label_id = label_map[example.label] if example.label is not None else 0 + features.append( + InputFeatures(input_ids=input_ids, + input_mask=input_mask, + segment_ids=segment_ids, + label_id=label_id, + ngram_ids=ngram_ids, + ngram_positions=ngram_positions_matrix, + ngram_lengths=ngram_lengths, + ngram_tuples=ngram_tuples, + ngram_seg_ids=ngram_seg_ids, + ngram_masks=ngram_mask_array)) + + return features + + +def _truncate_seq_pair(tokens_a, tokens_b, max_length): + """Truncates a sequence pair in place to the maximum length.""" + + # This is a simple heuristic which will always truncate the longer sequence + # one token at a time. This makes more sense than truncating an equal percent + # of tokens from each, since if one sequence is very short then each token + # that's truncated likely contains more information than a longer sequence. 
+ while True: + total_length = len(tokens_a) + len(tokens_b) + if total_length <= max_length: + break + if len(tokens_a) > len(tokens_b): + tokens_a.pop() + else: + tokens_b.pop() + + +class TaskDataset(Dataset): + def __init__(self, data_path, processor, mode='train'): + super().__init__() + self.data = self.load_data(data_path, processor, mode) + + def __len__(self): + return len(self.data) + + def __getitem__(self, index): + return self.data[index] + + def load_data(self, data_path, processor, mode): + if mode == "train": + examples = processor.get_examples(data_path, mode) + elif mode == "test": + examples = processor.get_examples(data_path, mode) + elif mode == "dev": + examples = processor.get_examples(data_path, mode) + return examples + + +@dataclass +class TaskCollator: + args = None + tokenizer = None + ngram_dict = None + label2id = None + + def __call__(self, samples): + features = convert_examples_to_features(samples, self.label2id, self.args.max_seq_length, self.tokenizer, self.ngram_dict) + # logger.info(" Num examples = %d", len(samples)) + input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long) + input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long) + segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long) + label_ids = torch.tensor([f.label_id for f in features], dtype=torch.long) + ngram_ids = torch.tensor([f.ngram_ids for f in features], dtype=torch.long) + ngram_positions = torch.tensor([f.ngram_positions for f in features], dtype=torch.long) + # ngram_lengths = torch.tensor([f.ngram_lengths for f in features], dtype=torch.long) + # ngram_seg_ids = torch.tensor([f.ngram_seg_ids for f in features], dtype=torch.long) + # ngram_masks = torch.tensor([f.ngram_masks for f in features], dtype=torch.long) + + return { + 'input_ids': input_ids, + 'input_ngram_ids': ngram_ids, + 'ngram_position_matrix': ngram_positions, + 'attention_mask': input_mask, + 'token_type_ids': segment_ids, + 'labels': label_ids, + + } + # return default_collate(sample_list) + + +class TaskDataModel(pl.LightningDataModule): + @staticmethod + def add_data_specific_args(parent_args): + parser = parent_args.add_argument_group('TASK NAME DataModel') + parser.add_argument('--data_dir', default='./data', type=str) + parser.add_argument('--num_workers', default=8, type=int) + parser.add_argument('--train_data', default='train.json', type=str) + parser.add_argument('--valid_data', default='dev.json', type=str) + parser.add_argument('--test_data', default='test.json', type=str) + parser.add_argument('--train_batchsize', default=16, type=int) + parser.add_argument('--valid_batchsize', default=32, type=int) + parser.add_argument('--max_seq_length', default=128, type=int) + + parser.add_argument('--texta_name', default='text', type=str) + parser.add_argument('--textb_name', default='sentence2', type=str) + parser.add_argument('--label_name', default='label', type=str) + parser.add_argument('--id_name', default='id', type=str) + + parser.add_argument('--dataset_name', default=None, type=str) + parser.add_argument('--vocab_file', + type=str, default=None, + help="Vocabulary mapping/file BERT was pretrainined on") + parser.add_argument("--do_lower_case", + action='store_true', + help="Set this flag if you are using an uncased model.") + parser.add_argument('--task_name', default='tnews', type=str) + + return parent_args + + def __init__(self, args): + super().__init__() + self.train_batchsize = args.train_batchsize + self.valid_batchsize = 
args.valid_batchsize + self.collator = TaskCollator() + self.collator.args = args + self.collator.tokenizer = BertTokenizer.from_pretrained(args.pretrained_model_path, do_lower_case=args.do_lower_case) + self.collator.ngram_dict = ZenNgramDict.from_pretrained(args.pretrained_model_path, tokenizer=self.collator.tokenizer) + + processors = { + 'afqmc': OcnliProcessor, + 'tnews': TnewsProcessor, + 'ocnli': OcnliProcessor, + 'cmnli': OcnliProcessor, + 'iflytek': IflytekProcessor, + } + if args.task_name not in processors: + raise ValueError("Task not found: %s" % (args.task_name)) + processor = processors[args.task_name]() + if args.dataset_name is None: + self.label2id, self.id2label = self.load_schema(os.path.join( + args.data_dir, args.train_data), args) + self.train_data = TaskDataset(os.path.join( + args.data_dir, args.train_data), processor, mode='train') + self.valid_data = TaskDataset(os.path.join( + args.data_dir, args.valid_data), processor, mode='dev') + self.test_data = TaskDataset(os.path.join( + args.data_dir, args.test_data), processor, mode='test') + self.collator.label2id = self.label2id + else: + import datasets + ds = datasets.load_dataset(args.dataset_name) + self.train_data = ds['train'] + self.valid_data = ds['validation'] + self.test_data = ds['test'] + self.save_hyperparameters(args) + + def train_dataloader(self): + return DataLoader(self.train_data, shuffle=True, batch_size=self.train_batchsize, pin_memory=False, + collate_fn=self.collator) + + def val_dataloader(self): + return DataLoader(self.valid_data, shuffle=False, batch_size=self.valid_batchsize, pin_memory=False, + collate_fn=self.collator) + + def predict_dataloader(self): + return DataLoader(self.test_data, shuffle=False, batch_size=self.valid_batchsize, pin_memory=False, + collate_fn=self.collator) + + def load_schema(self, data_path, args): + with open(data_path, 'r', encoding='utf8') as f: + lines = f.readlines() + label_list = [] + for line in tqdm(lines): + data = json.loads(line) + labels = data[args.label_name] if args.label_name in data.keys( + ) else 0 + if labels not in label_list: + label_list.append(labels) + + label2id, id2label = {}, {} + for i, k in enumerate(label_list): + label2id[k] = i + id2label[i] = k + return label2id, id2label + + +class LitModel(pl.LightningModule): + + @staticmethod + def add_model_specific_args(parent_args): + parser = parent_args.add_argument_group('BaseModel') + parser.add_argument('--num_labels', default=2, type=int) + + return parent_args + + def __init__(self, args): + super().__init__() + self.model = ZenForSequenceClassification.from_pretrained(args.pretrained_model_path, num_labels=args.num_labels) + self.save_hyperparameters(args) + + def setup(self, stage) -> None: + if stage == 'fit': + train_loader = self.trainer._data_connector._train_dataloader_source.dataloader() + + # Calculate total steps + if self.trainer.max_epochs > 0: + world_size = self.trainer.world_size + tb_size = self.hparams.train_batchsize * max(1, world_size) + ab_size = self.trainer.accumulate_grad_batches + self.total_steps = (len(train_loader.dataset) * + self.trainer.max_epochs // tb_size) // ab_size + else: + self.total_steps = self.trainer.max_steps // self.trainer.accumulate_grad_batches + + print('Total steps: {}' .format(self.total_steps)) + + def training_step(self, batch, batch_idx): + loss, logits = self.model(**batch) + acc = self.comput_metrix(logits, batch['labels']) + self.log('train_loss', loss) + self.log('train_acc', acc) + return loss + + def comput_metrix(self, 
logits, labels): + y_pred = torch.argmax(logits, dim=-1) + y_pred = y_pred.view(size=(-1,)) + y_true = labels.view(size=(-1,)).float() + corr = torch.eq(y_pred, y_true) + acc = torch.sum(corr.float())/labels.size()[0] + return acc + + def validation_step(self, batch, batch_idx): + loss, logits = self.model(**batch) + acc = self.comput_metrix(logits, batch['labels']) + self.log('val_loss', loss) + self.log('val_acc', acc) + + def predict_step(self, batch, batch_idx): + output = self.model(**batch) + return output.logits + + def configure_optimizers(self): + from fengshen.models.model_utils import configure_optimizers + return configure_optimizers(self) + + +class TaskModelCheckpoint: + @staticmethod + def add_argparse_args(parent_args): + parser = parent_args.add_argument_group('BaseModel') + + parser.add_argument('--monitor', default='train_loss', type=str) + parser.add_argument('--mode', default='min', type=str) + parser.add_argument('--dirpath', default='./log/', type=str) + parser.add_argument( + '--filename', default='model-{epoch:02d}-{train_loss:.4f}', type=str) + + parser.add_argument('--save_top_k', default=3, type=float) + parser.add_argument('--every_n_train_steps', default=100, type=float) + parser.add_argument('--save_weights_only', default=True, type=bool) + + return parent_args + + def __init__(self, args): + self.callbacks = ModelCheckpoint(monitor=args.monitor, + save_top_k=args.save_top_k, + mode=args.mode, + every_n_train_steps=args.every_n_train_steps, + save_weights_only=args.save_weights_only, + dirpath=args.dirpath, + filename=args.filename) + + +def save_test(data, args, data_model): + with open(args.output_save_path, 'w', encoding='utf-8') as f: + idx = 0 + for i in range(len(data)): + batch = data[i] + for sample in batch: + tmp_result = dict() + label_id = np.argmax(sample.numpy()) + tmp_result['id'] = data_model.test_data.data[idx]['id'] + tmp_result['label'] = data_model.id2label[label_id] + json_data = json.dumps(tmp_result, ensure_ascii=False) + f.write(json_data+'\n') + idx += 1 + print('save the result to '+args.output_save_path) + + +def main(): + total_parser = argparse.ArgumentParser("TASK NAME") + total_parser.add_argument('--pretrained_model_path', default='', type=str) + total_parser.add_argument('--output_save_path', + default='./predict.json', type=str) + # * Args for data preprocessing + total_parser = TaskDataModel.add_data_specific_args(total_parser) + # * Args for training + total_parser = pl.Trainer.add_argparse_args(total_parser) + total_parser = TaskModelCheckpoint.add_argparse_args(total_parser) + + # * Args for base model + from fengshen.models.model_utils import add_module_args + total_parser = add_module_args(total_parser) + total_parser = LitModel.add_model_specific_args(total_parser) + + args = total_parser.parse_args() + + checkpoint_callback = TaskModelCheckpoint(args).callbacks + lr_monitor = LearningRateMonitor(logging_interval='step') + trainer = pl.Trainer.from_argparse_args(args, + callbacks=[checkpoint_callback, lr_monitor] + ) + + data_model = TaskDataModel(args) + model = LitModel(args) + trainer.fit(model, data_model) + + +if __name__ == "__main__": + main() diff --git a/fengshen/examples/zen1_finetune/fengshen_token_level_ft_task.py b/fengshen/examples/zen1_finetune/fengshen_token_level_ft_task.py new file mode 100644 index 0000000000000000000000000000000000000000..8cb77bbe0edf675300614982466e802964f8c625 --- /dev/null +++ b/fengshen/examples/zen1_finetune/fengshen_token_level_ft_task.py @@ -0,0 +1,647 @@ +# coding=utf-8 +# 
Copyright 2021 The IDEA Authors. All rights reserved. + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at + +# http://www.apache.org/licenses/LICENSE-2.0 + +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from fengshen.models.zen1.ngram_utils import ZenNgramDict +from fengshen.models.zen1.modeling import ZenForTokenClassification +from fengshen.metric.metric import SeqEntityScore +from fengshen.models.zen1.tokenization import BertTokenizer +from random import shuffle +from pytorch_lightning.callbacks import LearningRateMonitor +from dataclasses import dataclass +import logging +import math +import numpy as np +import os +import json +import torch +import pytorch_lightning as pl +import argparse +from pytorch_lightning.callbacks import ModelCheckpoint +from torch.utils.data import Dataset, DataLoader + +import torch.nn.functional as F +logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s', + datefmt='%m/%d/%Y %H:%M:%S', + level=logging.ERROR) +logger = logging.getLogger(__name__) + + +class InputExample(object): + """A single training/test example for simple sequence classification.""" + + def __init__(self, guid, text_a, text_b=None, label=None): + """Constructs a InputExample. + + Args: + guid: Unique id for the example. + text_a: string. The untokenized text of the first sequence. For single + sequence tasks, only this sequence must be specified. + text_b: (Optional) string. The untokenized text of the second sequence. + Only must be specified for sequence pair tasks. + label: (Optional) string. The label of the example. This should be + specified for train and dev examples, but not for test examples. 
+ """ + self.guid = guid + self.text_a = text_a + self.text_b = text_b + self.label = label + + +class InputFeatures(object): + """A single set of features of data.""" + + def __init__(self, input_ids, input_mask, segment_ids, label_id, ngram_ids, ngram_positions, ngram_lengths, + ngram_tuples, ngram_seg_ids, ngram_masks, valid_ids=None, label_mask=None): + self.input_ids = input_ids + self.input_mask = input_mask + self.segment_ids = segment_ids + self.label_id = label_id + self.valid_ids = valid_ids + self.label_mask = label_mask + + self.ngram_ids = ngram_ids + self.ngram_positions = ngram_positions + self.ngram_lengths = ngram_lengths + self.ngram_tuples = ngram_tuples + self.ngram_seg_ids = ngram_seg_ids + self.ngram_masks = ngram_masks + + +def convert_examples_to_features(examples, label_map, max_seq_length, tokenizer, ngram_dict): + """Loads a data file into a list of `InputBatch`s.""" + + # label_map = {label: i for i, label in enumerate(label_list, 1)} + + features = [] + for (ex_index, example) in enumerate(examples): + textlist = example.text_a + labellist = example.label + tokens = [] + labels = [] + valid = [] + label_mask = [] + for i, word in enumerate(textlist): + token = tokenizer.tokenize(word) + tokens.extend(token) + label_1 = labellist[i] + for m in range(len(token)): + if m == 0: + labels.append(label_1) + valid.append(1) + label_mask.append(1) + else: + valid.append(0) + if len(tokens) >= max_seq_length - 1: + tokens = tokens[0:(max_seq_length - 2)] + labels = labels[0:(max_seq_length - 2)] + valid = valid[0:(max_seq_length - 2)] + label_mask = label_mask[0:(max_seq_length - 2)] + ntokens = [] + segment_ids = [] + label_ids = [] + ntokens.append("[CLS]") + segment_ids.append(0) + valid.insert(0, 1) + label_mask.insert(0, 1) + label_ids.append(label_map["[CLS]"]) + for i, token in enumerate(tokens): + ntokens.append(token) + segment_ids.append(0) + if len(labels) > i: + label_ids.append(label_map[labels[i]]) + ntokens.append("[SEP]") + segment_ids.append(0) + valid.append(1) + label_mask.append(1) + label_ids.append(label_map["[SEP]"]) + input_ids = tokenizer.convert_tokens_to_ids(ntokens) + input_mask = [1] * len(input_ids) + label_mask = [1] * len(label_ids) + while len(input_ids) < max_seq_length: + input_ids.append(0) + input_mask.append(0) + segment_ids.append(0) + label_ids.append(0) + valid.append(1) + label_mask.append(0) + while len(label_ids) < max_seq_length: + label_ids.append(0) + label_mask.append(0) + assert len(input_ids) == max_seq_length + assert len(input_mask) == max_seq_length + assert len(segment_ids) == max_seq_length + assert len(label_ids) == max_seq_length + assert len(valid) == max_seq_length + assert len(label_mask) == max_seq_length + + # ----------- code for ngram BEGIN----------- + ngram_matches = [] + # Filter the ngram segment from 2 to 7 to check whether there is a ngram + for p in range(2, 8): + for q in range(0, len(tokens) - p + 1): + character_segment = tokens[q:q + p] + # j is the starting position of the ngram + # i is the length of the current ngram + character_segment = tuple(character_segment) + if character_segment in ngram_dict.ngram_to_id_dict: + ngram_index = ngram_dict.ngram_to_id_dict[character_segment] + ngram_matches.append([ngram_index, q, p, character_segment]) + + shuffle(ngram_matches) + + max_ngram_in_seq_proportion = math.ceil((len(tokens) / max_seq_length) * ngram_dict.max_ngram_in_seq) + if len(ngram_matches) > max_ngram_in_seq_proportion: + ngram_matches = ngram_matches[:max_ngram_in_seq_proportion] + + 
ngram_ids = [ngram[0] for ngram in ngram_matches] + ngram_positions = [ngram[1] for ngram in ngram_matches] + ngram_lengths = [ngram[2] for ngram in ngram_matches] + ngram_tuples = [ngram[3] for ngram in ngram_matches] + ngram_seg_ids = [0 if position < (len(tokens) + 2) else 1 for position in ngram_positions] + + ngram_mask_array = np.zeros(ngram_dict.max_ngram_in_seq, dtype=np.bool) + ngram_mask_array[:len(ngram_ids)] = 1 + + # record the masked positions + ngram_positions_matrix = np.zeros(shape=(max_seq_length, ngram_dict.max_ngram_in_seq), dtype=np.int32) + for i in range(len(ngram_ids)): + ngram_positions_matrix[ngram_positions[i]:ngram_positions[i] + ngram_lengths[i], i] = 1.0 + + # Zero-pad up to the max ngram in seq length. + padding = [0] * (ngram_dict.max_ngram_in_seq - len(ngram_ids)) + ngram_ids += padding + ngram_lengths += padding + ngram_seg_ids += padding + + # ----------- code for ngram END----------- + + features.append( + InputFeatures(input_ids=input_ids, + input_mask=input_mask, + segment_ids=segment_ids, + label_id=label_ids, + ngram_ids=ngram_ids, + ngram_positions=ngram_positions_matrix, + ngram_lengths=ngram_lengths, + ngram_tuples=ngram_tuples, + ngram_seg_ids=ngram_seg_ids, + ngram_masks=ngram_mask_array, + valid_ids=valid, + label_mask=label_mask)) + return features + + +class DataProcessor(object): + """Base class for data converters for sequence classification data sets.""" + + def get_examples(self, data_path, set_type, quotechar=' '): + """See base class.""" + return self._create_examples( + self._read_tsv(data_path, self.get_quotechar()), set_type) + + def _create_examples(self, lines, set_type): + examples = [] + for i, (sentence, label) in enumerate(lines): + guid = "%s-%s" % (set_type, i) + text_a = sentence + label = label + examples.append(InputExample(guid=guid, text_a=text_a, label=label)) + return examples + + def get_labels(self): + """Gets the list of labels for this data set.""" + raise NotImplementedError() + + def get_quotechar(self): + return ' ' + + @classmethod + def _read_tsv(cls, input_file, quotechar=None): + ''' + read file + return format : + [ ['EU', 'B-ORG'], ['rejects', 'O'], ['German', 'B-MISC'], ['call', 'O'], ['to', 'O'], ['boycott', 'O'], ['British', 'B-MISC'], ['lamb', 'O'], ['.', 'O'] ] + ''' + f = open(input_file) + data = [] + sentence = [] + label = [] + for line in f: + if len(line) == 0 or line.startswith('-DOCSTART') or line[0] == "\n": + if len(sentence) > 0: + data.append((sentence, label)) + sentence = [] + label = [] + continue + splits = line.split(quotechar) + sentence.append(splits[0]) + label.append(splits[-1][:-1]) + + if len(sentence) > 0: + data.append((sentence, label)) + sentence = [] + label = [] + return data + + +class MSRAProcessor(DataProcessor): + """Processor for the msra data set.""" + + def get_labels(self): + return ['B-NR', 'B-NS', 'B-NT', 'E-NR', 'E-NS', 'E-NT', 'M-NR', + 'M-NS', 'M-NT', 'O', 'S-NR', 'S-NS', 'S-NT', '[CLS]', '[SEP]'] + + +class OntoNotes4Processor(DataProcessor): + """Processor for the OntoNotes4 data set.""" + + def get_labels(self): + return ['B-GPE', 'B-LOC', 'B-ORG', 'B-PER', 'E-GPE', 'E-LOC', + 'E-ORG', 'E-PER', 'M-GPE', 'M-LOC', 'M-ORG', 'M-PER', 'O', + 'S-GPE', 'S-LOC', 'S-ORG', 'S-PER', '[CLS]', '[SEP]'] + + +class WeiboProcessor(DataProcessor): + """Processor for the Weibo data set.""" + + def get_labels(self): + return ['B-GPE.NAM', 'B-GPE.NOM', 'B-LOC.NAM', 'B-LOC.NOM', + 'B-ORG.NAM', 'B-ORG.NOM', 'B-PER.NAM', 'B-PER.NOM', 'E-GPE.NAM', + 'E-GPE.NOM', 'E-LOC.NAM', 
'E-LOC.NOM', 'E-ORG.NAM', 'E-ORG.NOM', + 'E-PER.NAM', 'E-PER.NOM', 'M-GPE.NAM', 'M-LOC.NAM', 'M-LOC.NOM', + 'M-ORG.NAM', 'M-ORG.NOM', 'M-PER.NAM', 'M-PER.NOM', 'O', + 'S-GPE.NAM', 'S-LOC.NOM', 'S-PER.NAM', 'S-PER.NOM', '[CLS]', '[SEP]'] + + +class ResumeProcessor(DataProcessor): + """Processor for the resume data set.""" + + def get_labels(self): + return ['B-CONT', 'B-EDU', 'B-LOC', 'B-NAME', 'B-ORG', 'B-PRO', + 'B-RACE', 'B-TITLE', 'E-CONT', 'E-EDU', 'E-LOC', 'E-NAME', + 'E-ORG', 'E-PRO', 'E-RACE', 'E-TITLE', 'M-CONT', 'M-EDU', + 'M-LOC', 'M-NAME', 'M-ORG', 'M-PRO', 'M-RACE', 'M-TITLE', + 'O', 'S-NAME', 'S-ORG', 'S-RACE', '[CLS]', '[SEP]'] + + +class CMeEEProcessor(DataProcessor): + """Processor for the CMeEE data set.""" + + def get_quotechar(self): + return '\t' + + def get_labels(self): + return ['B-临床表现', 'B-医学检验项目', 'B-医疗程序', 'B-医疗设备', + 'B-微生物类', 'B-疾病', 'B-科室', 'B-药物', 'B-身体', 'I-临床表现', + 'I-医学检验项目', 'I-医疗程序', 'I-医疗设备', 'I-微生物类', + 'I-疾病', 'I-科室', 'I-药物', 'I-身体', 'O', '[CLS]', '[SEP]'] + + +class CLUENERProcessor(DataProcessor): + """Processor for the CLUENER data set.""" + + def get_quotechar(self): + return '\t' + + def get_labels(self): + return ['B-书名', 'B-公司', 'B-地址', 'B-姓名', 'B-政府', 'B-景点', + 'B-游戏', 'B-电影', 'B-组织机构', 'B-职位', 'I-书名', 'I-公司', + 'I-地址', 'I-姓名', 'I-政府', 'I-景点', 'I-游戏', 'I-电影', + 'I-组织机构', 'I-职位', 'O', '[CLS]', '[SEP]'] + + +class TaskDataset(Dataset): + def __init__(self, data_path, processor, mode='train'): + super().__init__() + self.data = self.load_data(data_path, processor, mode) + + def __len__(self): + return len(self.data) + + def __getitem__(self, index): + return self.data[index] + + def load_data(self, data_path, processor, mode): + if mode == "train": + examples = processor.get_examples(data_path, mode) + elif mode == "test": + examples = processor.get_examples(data_path, mode) + elif mode == "dev": + examples = processor.get_examples(data_path, mode) + return examples + + +@dataclass +class TaskCollator: + args = None + tokenizer = None + ngram_dict = None + label2id = None + + def __call__(self, samples): + features = convert_examples_to_features(samples, self.label2id, self.args.max_seq_length, self.tokenizer, self.ngram_dict) + # logger.info(" Num examples = %d", len(samples)) + + input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long) + input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long) + segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long) + label_ids = torch.tensor([f.label_id for f in features], dtype=torch.long) + valid_ids = torch.tensor([f.valid_ids for f in features], dtype=torch.long) + + ngram_ids = torch.tensor([f.ngram_ids for f in features], dtype=torch.long) + ngram_positions = torch.tensor([f.ngram_positions for f in features], dtype=torch.long) + # ngram_lengths = torch.tensor([f.ngram_lengths for f in features], dtype=torch.long) + # ngram_seg_ids = torch.tensor([f.ngram_seg_ids for f in features], dtype=torch.long) + # ngram_masks = torch.tensor([f.ngram_masks for f in features], dtype=torch.long) + + # label_mask = torch.tensor([f.label_mask for f in features], dtype=torch.long) + return { + 'input_ids': input_ids, + 'ngram_ids': ngram_ids, + 'ngram_positions': ngram_positions, + 'attention_mask': input_mask, + 'token_type_ids': segment_ids, + 'labels': label_ids, + 'valid_ids': valid_ids, + } + + +class TaskDataModel(pl.LightningDataModule): + @staticmethod + def add_data_specific_args(parent_args): + parser = parent_args.add_argument_group('TASK NAME 
DataModel') + parser.add_argument('--data_dir', default='./data', type=str) + parser.add_argument('--num_workers', default=8, type=int) + parser.add_argument('--train_data', default='train.json', type=str) + parser.add_argument('--valid_data', default='dev.json', type=str) + parser.add_argument('--test_data', default='test.json', type=str) + parser.add_argument('--train_batchsize', default=16, type=int) + parser.add_argument('--valid_batchsize', default=32, type=int) + parser.add_argument('--max_seq_length', default=128, type=int) + + parser.add_argument('--texta_name', default='text', type=str) + parser.add_argument('--textb_name', default='sentence2', type=str) + parser.add_argument('--label_name', default='label', type=str) + parser.add_argument('--id_name', default='id', type=str) + + parser.add_argument('--dataset_name', default=None, type=str) + parser.add_argument('--vocab_file', + type=str, default=None, + help="Vocabulary mapping/file BERT was pretrainined on") + parser.add_argument("--do_lower_case", + action='store_true', + help="Set this flag if you are using an uncased model.") + parser.add_argument('--task_name', default='weibo', type=str) + + return parent_args + + def __init__(self, args): + super().__init__() + self.train_batchsize = args.train_batchsize + self.valid_batchsize = args.valid_batchsize + self.collator = TaskCollator() + self.collator.args = args + self.collator.tokenizer = BertTokenizer.from_pretrained(args.pretrained_model_path, do_lower_case=args.do_lower_case) + self.collator.ngram_dict = ZenNgramDict.from_pretrained(args.pretrained_model_path, tokenizer=self.collator.tokenizer) + + processors = { + 'weibo': WeiboProcessor, + 'resume': ResumeProcessor, + 'msra': MSRAProcessor, + 'ontonotes4': OntoNotes4Processor, + 'cmeee': CMeEEProcessor, + 'cluener': CLUENERProcessor, + } + if args.task_name not in processors: + raise ValueError("Task not found: %s" % (args.task_name)) + processor = processors[args.task_name]() + # 生成id映射 + label_list = processor.get_labels() + label2id = {label: i for i, label in enumerate(label_list, 1)} + label2id["[PAD]"] = 0 + self.id2label = {v: k for k, v in label2id.items()} + self.collator.label2id = label2id + + if args.dataset_name is None: + self.train_data = TaskDataset(os.path.join( + args.data_dir, args.train_data), processor, mode='train') + self.valid_data = TaskDataset(os.path.join( + args.data_dir, args.valid_data), processor, mode='dev') + self.test_data = TaskDataset(os.path.join( + args.data_dir, args.test_data), processor, mode='test') + + else: + import datasets + ds = datasets.load_dataset(args.dataset_name) + self.train_data = ds['train'] + self.valid_data = ds['validation'] + self.test_data = ds['test'] + self.save_hyperparameters(args) + + def train_dataloader(self): + return DataLoader(self.train_data, shuffle=True, batch_size=self.train_batchsize, pin_memory=False, + collate_fn=self.collator) + + def val_dataloader(self): + return DataLoader(self.valid_data, shuffle=False, batch_size=self.valid_batchsize, pin_memory=False, + collate_fn=self.collator) + + def predict_dataloader(self): + return DataLoader(self.test_data, shuffle=False, batch_size=self.valid_batchsize, pin_memory=False, + collate_fn=self.collator) + + +class LitModel(pl.LightningModule): + + @staticmethod + def add_model_specific_args(parent_args): + parser = parent_args.add_argument_group('BaseModel') + parser.add_argument('--markup', default='bios', type=str) + parser.add_argument('--middle_prefix', default='I-', type=str) + return 
parent_args + + def __init__(self, args, id2label): + super().__init__() + # config = ZenConfig(os.path.join(args.pretrained_model_path, 'config.json')) + self.model = ZenForTokenClassification.from_pretrained(args.pretrained_model_path, num_labels=len(id2label)) + self.seq_entity_score = SeqEntityScore(id2label, markup=args.markup, middle_prefix=args.middle_prefix) + self.train_seq_entity_score = SeqEntityScore(id2label, markup=args.markup, middle_prefix=args.middle_prefix) + self.id2label = id2label + self.label2id = {v: k for k, v in id2label.items()} + self.save_hyperparameters(args) + + def setup(self, stage) -> None: + if stage == 'fit': + train_loader = self.trainer._data_connector._train_dataloader_source.dataloader() + + # Calculate total steps + if self.trainer.max_epochs > 0: + world_size = self.trainer.world_size + tb_size = self.hparams.train_batchsize * max(1, world_size) + ab_size = self.trainer.accumulate_grad_batches + self.total_steps = (len(train_loader.dataset) * + self.trainer.max_epochs // tb_size) // ab_size + else: + self.total_steps = self.trainer.max_steps // self.trainer.accumulate_grad_batches + + print('Total steps: {}' .format(self.total_steps)) + + def training_step(self, batch, batch_idx): + outputs = self.model(**batch) + loss, _ = outputs + # logits = outputs.logits + # preds = torch.argmax(F.log_softmax(logits, dim=2), dim=2) + # preds = preds.detach().cpu().numpy() + # labels = batch['labels'].detach().cpu().numpy() + # num_labels = len(self.label2id) + # y_true = [] + # y_pred = [] + # for i, label in enumerate(labels): + # temp_1 = [] + # temp_2 = [] + # for j, m in enumerate(label): + # if j == 0: + # continue + # elif labels[i][j] == num_labels - 1: + # y_true.append(temp_1) + # y_pred.append(temp_2) + # break + # else: + # temp_1.append(self.id2label[labels[i][j]]) + # temp_2.append(self.id2label[preds[i][j]]) + + # self.train_seq_entity_score.update(y_true, y_pred) + # result = self.train_seq_entity_score.result() + # self.train_seq_entity_score.reset() + self.log('train_loss', loss) + + return loss + + def validation_step(self, batch, batch_idx): + outputs = self.model(**batch) + loss, logits = outputs + preds = torch.argmax(F.log_softmax(logits, dim=2), dim=2) + preds = preds.detach().cpu().numpy() + labels = batch['labels'].detach().cpu().numpy() + num_labels = len(self.label2id) + y_true = [] + y_pred = [] + for i, label in enumerate(labels): + temp_1 = [] + temp_2 = [] + for j, m in enumerate(label): + if j == 0: + continue + elif labels[i][j] == num_labels - 1: + y_true.append(temp_1) + y_pred.append(temp_2) + break + else: + temp_1.append(self.id2label[labels[i][j]]) + temp_2.append(self.id2label[preds[i][j]]) + + self.seq_entity_score.update(y_true, y_pred) + self.log('val_loss', loss) + + def validation_epoch_end(self, outputs): + # compute metric for all process + score_dict, _ = self.seq_entity_score.result() + if self.trainer._accelerator_connector.cluster_environment.global_rank() == 0: + print('score_dict:\n', score_dict) + # reset the metric after once validation + self.seq_entity_score.reset() + for k, v in score_dict.items(): + self.log('val_{}'.format(k), v) + + def configure_optimizers(self): + from fengshen.models.model_utils import configure_optimizers + return configure_optimizers(self) + + +class TaskModelCheckpoint: + @staticmethod + def add_argparse_args(parent_args): + parser = parent_args.add_argument_group('BaseModel') + + parser.add_argument('--monitor', default='train_loss', type=str) + parser.add_argument('--mode', 
default='min', type=str) + parser.add_argument('--dirpath', default='./log/', type=str) + parser.add_argument( + '--filename', default='model-{epoch:02d}-{train_loss:.4f}', type=str) + + parser.add_argument('--save_top_k', default=3, type=float) + parser.add_argument('--every_n_train_steps', default=100, type=float) + parser.add_argument('--save_weights_only', default=True, type=bool) + + return parent_args + + def __init__(self, args): + self.callbacks = ModelCheckpoint(monitor=args.monitor, + save_top_k=args.save_top_k, + mode=args.mode, + every_n_train_steps=args.every_n_train_steps, + save_weights_only=args.save_weights_only, + dirpath=args.dirpath, + filename=args.filename) + + +def save_test(data, args, data_model): + with open(args.output_save_path, 'w', encoding='utf-8') as f: + idx = 0 + for i in range(len(data)): + batch = data[i] + for sample in batch: + tmp_result = dict() + label_id = np.argmax(sample.numpy()) + tmp_result['id'] = data_model.test_data.data[idx]['id'] + tmp_result['label'] = data_model.id2label[label_id] + json_data = json.dumps(tmp_result, ensure_ascii=False) + f.write(json_data+'\n') + idx += 1 + print('save the result to '+args.output_save_path) + + +def main(): + total_parser = argparse.ArgumentParser("TASK NAME") + total_parser.add_argument('--pretrained_model_path', default='', type=str) + total_parser.add_argument('--output_save_path', + default='./predict.json', type=str) + # * Args for data preprocessing + total_parser = TaskDataModel.add_data_specific_args(total_parser) + # * Args for training + total_parser = pl.Trainer.add_argparse_args(total_parser) + total_parser = TaskModelCheckpoint.add_argparse_args(total_parser) + + # * Args for base model + from fengshen.models.model_utils import add_module_args + total_parser = add_module_args(total_parser) + total_parser = LitModel.add_model_specific_args(total_parser) + + args = total_parser.parse_args() + + checkpoint_callback = TaskModelCheckpoint(args).callbacks + lr_monitor = LearningRateMonitor(logging_interval='step') + trainer = pl.Trainer.from_argparse_args(args, + callbacks=[checkpoint_callback, lr_monitor] + ) + + data_model = TaskDataModel(args) + id2label = data_model.id2label + print('id2label:', id2label) + model = LitModel(args, id2label) + trainer.fit(model, data_model) + + +if __name__ == "__main__": + main() diff --git a/fengshen/examples/zen1_finetune/fs_zen1_tnews.sh b/fengshen/examples/zen1_finetune/fs_zen1_tnews.sh new file mode 100644 index 0000000000000000000000000000000000000000..39f2b54063725514f3fd57fa56346a0796e26828 --- /dev/null +++ b/fengshen/examples/zen1_finetune/fs_zen1_tnews.sh @@ -0,0 +1,95 @@ +#!/bin/bash +#SBATCH --job-name=zen1_tnews # create a short name for your job +#SBATCH --nodes=1 # node count +#SBATCH --ntasks=1 # total number of tasks across all nodes +#SBATCH --cpus-per-task=30 # cpu-cores per task (>1 if multi-threaded tasks) +#SBATCH --gres=gpu:1 # number of gpus per node +#SBATCH --mail-type=ALL # send email when job begins, ends or failed etc. +#SBATCH -o %x-%j.log # output and error file name (%x=job name, %j=job id) + + +export CUDA_VISIBLE_DEVICES='1' +export CUDA_LAUNCH_BLOCKING=1 +export TORCH_EXTENSIONS_DIR=/cognitive_comp/ganruyi/tmp/torch_extendsions + +MODEL_NAME=zen1 + +TASK=tnews + +ZERO_STAGE=1 +STRATEGY=deepspeed_stage_${ZERO_STAGE} + +ROOT_DIR=/cognitive_comp/ganruyi/experiments/classification_finetune/${MODEL_NAME}_${TASK} +if [ ! -d ${ROOT_DIR} ];then + mkdir -p ${ROOT_DIR} + echo ${ROOT_DIR} created!!!!!!!!!!!!!! 
+else + echo ${ROOT_DIR} exist!!!!!!!!!!!!!!! +fi + +DATA_DIR=/cognitive_comp/yangping/data/ChineseCLUE_DATA/${TASK}_public/ +PRETRAINED_MODEL_PATH=/cognitive_comp/ganruyi/hf_models/zen/ZEN_pretrain_base_v0.1.0 +PRETRAINED_MODEL_PATH=IDEA-CCNL/Erlangshen-ZEN1-224M-Chinese + +CHECKPOINT_PATH=${ROOT_DIR}/ckpt/ +OUTPUT_PATH=${ROOT_DIR}/predict.json + +DATA_ARGS="\ + --data_dir $DATA_DIR \ + --train_data train.json \ + --valid_data dev.json \ + --test_data test1.1.json \ + --train_batchsize 32 \ + --valid_batchsize 16 \ + --max_seq_length 128 \ + --texta_name sentence \ + --label_name label \ + --id_name id \ + --task_name tnews \ + " + +MODEL_ARGS="\ + --learning_rate 2e-5 \ + --weight_decay 0.01 \ + --warmup_ratio 0.01 \ + --num_labels 15 \ + " + +MODEL_CHECKPOINT_ARGS="\ + --monitor val_acc \ + --save_top_k 3 \ + --mode max \ + --every_n_train_steps 400 \ + --save_weights_only True \ + --dirpath $CHECKPOINT_PATH \ + --filename model-{epoch:02d}-{val_acc:.4f} \ + " + +TRAINER_ARGS="\ + --max_epochs 10 \ + --gpus 1 \ + --check_val_every_n_epoch 1 \ + --val_check_interval 400 \ + --default_root_dir $ROOT_DIR \ + " + + +options=" \ + --pretrained_model_path $PRETRAINED_MODEL_PATH \ + --vocab_file $PRETRAINED_MODEL_PATH/vocab.txt \ + --do_lower_case \ + --output_save_path $OUTPUT_PATH \ + $DATA_ARGS \ + $MODEL_ARGS \ + $MODEL_CHECKPOINT_ARGS \ + $TRAINER_ARGS \ +" +SCRIPT_PATH=/cognitive_comp/ganruyi/Fengshenbang-LM/fengshen/examples/zen1_finetune/fengshen_sequence_level_ft_task.py +/home/ganruyi/anaconda3/bin/python $SCRIPT_PATH $options + +# SINGULARITY_PATH=/cognitive_comp/ganruyi/pytorch21_06_py3_docker_image_v2.sif +# python3 $SCRIPT_PATH $options +# source activate base +# singularity exec --nv -B /cognitive_comp/:/cognitive_comp/ $SINGULARITY_PATH /home/ganruyi/anaconda3/bin/python $SCRIPT_PATH $options +# /home/ganruyi/anaconda3/bin/python $SCRIPT_PATH $options + diff --git a/fengshen/examples/zen1_finetune/ner_zen1_ontonotes4.sh b/fengshen/examples/zen1_finetune/ner_zen1_ontonotes4.sh new file mode 100644 index 0000000000000000000000000000000000000000..be51a3f3d709d761b6dcb4e5759cc5b92a09a609 --- /dev/null +++ b/fengshen/examples/zen1_finetune/ner_zen1_ontonotes4.sh @@ -0,0 +1,91 @@ +#!/bin/bash +#SBATCH --job-name=zen1_base_ontonotes4 # create a short name for your job +#SBATCH --nodes=1 # node count +#SBATCH --ntasks=1 # total number of tasks across all nodes +#SBATCH --cpus-per-task=30 # cpu-cores per task (>1 if multi-threaded tasks) +#SBATCH --gres=gpu:1 # number of gpus per node +#SBATCH --mail-type=ALL # send email when job begins, ends or failed etc. +#SBATCH -o /cognitive_comp/ganruyi/experiments/ner_finetune/zen1_base_ontonotes4/%x-%j.log # output and error file name (%x=job name, %j=job id) + + +export CUDA_VISIBLE_DEVICES='1' +export TORCH_EXTENSIONS_DIR=/cognitive_comp/ganruyi/tmp/torch_extendsions + +MODEL_NAME=zen1_base + +TASK=ontonotes4 + +ZERO_STAGE=1 +STRATEGY=deepspeed_stage_${ZERO_STAGE} + +ROOT_DIR=/cognitive_comp/ganruyi/experiments/ner_finetune/${MODEL_NAME}_${TASK} +if [ ! -d ${ROOT_DIR} ];then + mkdir -p ${ROOT_DIR} + echo ${ROOT_DIR} created!!!!!!!!!!!!!! +else + echo ${ROOT_DIR} exist!!!!!!!!!!!!!!! 
+fi + +DATA_DIR=/cognitive_comp/lujunyu/data_zh/NER_Aligned/OntoNotes4/ +PRETRAINED_MODEL_PATH=/cognitive_comp/ganruyi/hf_models/zen/ZEN_pretrain_base_v0.1.0 + +CHECKPOINT_PATH=${ROOT_DIR}/ckpt/ +OUTPUT_PATH=${ROOT_DIR}/predict.json + +DATA_ARGS="\ + --data_dir $DATA_DIR \ + --train_data train.char.bmes \ + --valid_data test.char.bmes \ + --test_data test.char.bmes \ + --train_batchsize 64 \ + --valid_batchsize 16 \ + --max_seq_length 128 \ + --task_name ontonotes4 \ + " + +MODEL_ARGS="\ + --learning_rate 3e-5 \ + --weight_decay 0.1 \ + --warmup_ratio 0.01 \ + --markup bioes \ + --middle_prefix M- \ + " + +MODEL_CHECKPOINT_ARGS="\ + --monitor val_f1 \ + --save_top_k 3 \ + --mode max \ + --every_n_train_steps 200 \ + --save_weights_only True \ + --dirpath $CHECKPOINT_PATH \ + --filename model-{epoch:02d}-{val_f1:.4f} \ + " + +TRAINER_ARGS="\ + --max_epochs 30 \ + --gpus 1 \ + --check_val_every_n_epoch 1 \ + --val_check_interval 200 \ + --default_root_dir $ROOT_DIR \ + " + + +options=" \ + --pretrained_model_path $PRETRAINED_MODEL_PATH \ + --vocab_file $PRETRAINED_MODEL_PATH/vocab.txt \ + --do_lower_case \ + --output_save_path $OUTPUT_PATH \ + $DATA_ARGS \ + $MODEL_ARGS \ + $MODEL_CHECKPOINT_ARGS \ + $TRAINER_ARGS \ +" +SCRIPT_PATH=/cognitive_comp/ganruyi/Fengshenbang-LM/fengshen/examples/zen1_finetune/fengshen_token_level_ft_task.py +/home/ganruyi/anaconda3/bin/python $SCRIPT_PATH $options + +# SINGULARITY_PATH=/cognitive_comp/ganruyi/pytorch21_06_py3_docker_image_v2.sif +# python3 $SCRIPT_PATH $options +# source activate base +# singularity exec --nv -B /cognitive_comp/:/cognitive_comp/ $SINGULARITY_PATH /home/ganruyi/anaconda3/bin/python $SCRIPT_PATH $options +# /home/ganruyi/anaconda3/bin/python $SCRIPT_PATH $options + diff --git a/fengshen/examples/zen2_finetune/fengshen_sequence_level_ft_task.py b/fengshen/examples/zen2_finetune/fengshen_sequence_level_ft_task.py new file mode 100644 index 0000000000000000000000000000000000000000..ed400468cc3d0820d4b34385f270639014039ad1 --- /dev/null +++ b/fengshen/examples/zen2_finetune/fengshen_sequence_level_ft_task.py @@ -0,0 +1,649 @@ +# coding=utf-8 +# Copyright 2021 The IDEA Authors. All rights reserved. + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at + +# http://www.apache.org/licenses/LICENSE-2.0 + +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
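+# Fine-tunes ZEN2 for sequence-level classification on CLUE-style tasks
+# (tnews / afqmc / ocnli / cmnli / iflytek) with PyTorch Lightning: a task
+# DataProcessor reads JSONL examples, convert_examples_to_features adds the
+# n-gram features from ZenNgramDict, and LitModel wraps
+# ZenForSequenceClassification for training, validation and prediction.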
+from fengshen.models.zen2.modeling import ZenForSequenceClassification +from fengshen.models.zen2.ngram_utils import ZenNgramDict +from fengshen.models.zen2.tokenization import BertTokenizer +from pytorch_lightning.callbacks import LearningRateMonitor +import csv +from dataclasses import dataclass +import logging +import math +import numpy as np +import os +from tqdm import tqdm +import json +import torch +import pytorch_lightning as pl +import argparse +from pytorch_lightning.callbacks import ModelCheckpoint +from torch.utils.data import Dataset, DataLoader + +logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s', + datefmt='%m/%d/%Y %H:%M:%S', + level=logging.INFO) +logger = logging.getLogger(__name__) + + +class InputExample(object): + """A single training/test example for simple sequence classification.""" + + def __init__(self, guid, text_a, text_b=None, label=None, qid=0): + """Constructs a InputExample. + + Args: + guid: Unique id for the example. + text_a: string. The untokenized text of the first sequence. For single + sequence tasks, only this sequence must be specified. + text_b: (Optional) string. The untokenized text of the second sequence. + Only must be specified for sequence pair tasks. + label: (Optional) string. The label of the example. This should be + specified for train and dev examples, but not for test examples. + """ + self.guid = guid + self.text_a = text_a + self.text_b = text_b + self.label = label + self.qid = qid + + +class InputFeatures(object): + """A single set of features of data.""" + + def __init__(self, input_ids, input_mask, segment_ids, label_id, + ngram_ids, ngram_starts, ngram_lengths, ngram_tuples, ngram_seg_ids, ngram_masks, ngram_freqs, + qid=-1): + self.input_ids = input_ids + self.input_mask = input_mask + self.segment_ids = segment_ids + self.label_id = label_id + self.qid = qid + + self.ngram_ids = ngram_ids + self.ngram_starts = ngram_starts + self.ngram_lengths = ngram_lengths + self.ngram_tuples = ngram_tuples + self.ngram_seg_ids = ngram_seg_ids + self.ngram_masks = ngram_masks + self.ngram_freqs = ngram_freqs + + +class DataProcessor(object): + """Base class for data converters for sequence classification data sets.""" + + def get_examples(self, data_path, mode): + """Gets a collection of `InputExample`s for the train set.""" + raise NotImplementedError() + + @classmethod + def _read_tsv(cls, input_file, quotechar=None): + """Reads a tab separated value file.""" + with open(input_file, "r") as f: + reader = csv.reader(f, delimiter="\t", quotechar=quotechar) + lines = [] + for line in reader: + # if sys.version_info[0] == 2: + # line = list(unicode(cell, 'utf-8') for cell in line) + lines.append(line) + return lines + + @classmethod + def _read_json(cls, input_file): + """Reads a jsonl file.""" + with open(input_file, "r", encoding="utf-8") as f: + lines = f.readlines() + samples = [] + for line in tqdm(lines): + data = json.loads(line) + samples.append(data) + return samples + + +class TnewsProcessor(DataProcessor): + """Processor for the tnews data set (HIT version).""" + + def get_train_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_json(os.path.join(data_dir, "train.json")), "train") + + def get_examples(self, data_path, mode): + return self._create_examples( + self._read_json(data_path), + set_type=mode + ) + + def _create_examples(self, lines, set_type): + """Creates examples for the training and dev sets.""" + examples = [] + for (i, line) in 
enumerate(lines): + # if i == 0: + # continue + guid = "%s-%s" % (set_type, i) + # text_a = line[0] + text_a = line['sentence'] + label = line['label'] if 'label' in line.keys() else None + examples.append( + InputExample(guid=guid, text_a=text_a, label=label)) + return examples + + +class OcnliProcessor(DataProcessor): + """Processor for the ocnli or cmnli data set (HIT version).""" + + def get_examples(self, data_path, mode): + return self._create_examples( + self._read_json(data_path), + set_type=mode + ) + + def _create_examples(self, lines, set_type): + """Creates examples for the training and dev sets.""" + examples = [] + for (i, line) in enumerate(lines): + # if i == 0: + # continue + guid = "%s-%s" % (set_type, i) + # text_a = line[0] + text_a = line['sentence1'] + text_b = line['sentence2'] + label = line['label'] if 'label' in line.keys() else None + # 特殊处理,cmnli有label为-的 + if label == '-': + label = None + examples.append( + InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) + return examples + + +class IflytekProcessor(DataProcessor): + """Processor for the iflytek data set (HIT version).""" + + def get_examples(self, data_path, mode): + return self._create_examples( + self._read_json(data_path), + set_type=mode + ) + + def _create_examples(self, lines, set_type): + """Creates examples for the training and dev sets.""" + examples = [] + for (i, line) in enumerate(lines): + # if i == 0: + # continue + guid = "%s-%s" % (set_type, i) + # text_a = line[0] + text_a = line['sentence'] + label = line['label'] if 'label' in line.keys() else None + examples.append( + InputExample(guid=guid, text_a=text_a, label=label)) + return examples + + +def convert_examples_to_features(examples, label_map, max_seq_length, tokenizer, ngram_dict): + """Loads a data file into a list of `InputBatch`s.""" + + # label_map = {label : i for i, label in enumerate(label_list)} + features = [] + for (ex_index, example) in enumerate(examples): + tokens_a = tokenizer.tokenize(example.text_a) + + tokens_b = None + if example.text_b: + tokens_b = tokenizer.tokenize(example.text_b) + # Modifies `tokens_a` and `tokens_b` in place so that the total + # length is less than the specified length. + # Account for [CLS], [SEP], [SEP] with "- 3" + _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3) + else: + # Account for [CLS] and [SEP] with "- 2" + if len(tokens_a) > max_seq_length - 2: + tokens_a = tokens_a[:(max_seq_length - 2)] + + # The convention in BERT is: + # (a) For sequence pairs: + # tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP] + # type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1 + # (b) For single sequences: + # tokens: [CLS] the dog is hairy . [SEP] + # type_ids: 0 0 0 0 0 0 0 + # + # Where "type_ids" are used to indicate whether this is the first + # sequence or the second sequence. The embedding vectors for `type=0` and + # `type=1` were learned during pre-training and are added to the wordpiece + # embedding vector (and position vector). This is not *strictly* necessary + # since the [SEP] token unambigiously separates the sequences, but it makes + # it easier for the model to learn the concept of sequences. + # + # For classification tasks, the first vector (corresponding to [CLS]) is + # used as as the "sentence vector". Note that this only makes sense because + # the entire model is fine-tuned. 
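+        # Illustration with assumed inputs (and assuming the tokenizer splits
+        # Chinese text per character): for text_a = "今天天气" and text_b = "天气不错",
+        # the code below builds
+        #   [CLS] 今 天 天 气 [SEP] 天 气 不 错 [SEP]
+        # with segment_ids 0 0 0 0 0 0 1 1 1 1 1.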
+ tokens = ["[CLS]"] + tokens_a + ["[SEP]"] + segment_ids = [0] * len(tokens) + + if tokens_b: + tokens += tokens_b + ["[SEP]"] + segment_ids += [1] * (len(tokens_b) + 1) + + input_ids = tokenizer.convert_tokens_to_ids(tokens) + + # The mask has 1 for real tokens and 0 for padding tokens. Only real + # tokens are attended to. + input_mask = [1] * len(input_ids) + + # Zero-pad up to the sequence length. + padding = [0] * (max_seq_length - len(input_ids)) + input_ids += padding + input_mask += padding + segment_ids += padding + + assert len(input_ids) == max_seq_length + assert len(input_mask) == max_seq_length + assert len(segment_ids) == max_seq_length + + # ----------- code for ngram BEGIN----------- + ngram_matches = [] + # Filter the word segment from 2 to max_ngram_len to check whether there is a word + max_gram_n = ngram_dict.max_ngram_len + for p in range(2, max_gram_n): + for q in range(0, len(tokens) - p + 1): + character_segment = tokens[q:q + p] + # j is the starting position of the word + # i is the length of the current word + character_segment = tuple(character_segment) + if character_segment in ngram_dict.ngram_to_id_dict: + ngram_index = ngram_dict.ngram_to_id_dict[character_segment] + ngram_freq = ngram_dict.ngram_to_freq_dict[character_segment] + ngram_matches.append([ngram_index, q, p, character_segment, ngram_freq]) + + # shuffle(ngram_matches) + ngram_matches = sorted(ngram_matches, key=lambda s: s[0]) + # max_word_in_seq_proportion = max_word_in_seq + max_word_in_seq_proportion = math.ceil((len(tokens) / max_seq_length) * ngram_dict.max_ngram_in_seq) + if len(ngram_matches) > max_word_in_seq_proportion: + ngram_matches = ngram_matches[:max_word_in_seq_proportion] + ngram_ids = [ngram[0] for ngram in ngram_matches] + ngram_positions = [ngram[1] for ngram in ngram_matches] + ngram_lengths = [ngram[2] for ngram in ngram_matches] + ngram_tuples = [ngram[3] for ngram in ngram_matches] + ngram_freqs = [ngram[4] for ngram in ngram_matches] + ngram_seg_ids = [0 if position < len([id for id in segment_ids if id == 0]) else 1 for position in + ngram_positions] + + ngram_mask_array = np.zeros(ngram_dict.max_ngram_in_seq, dtype=np.bool) + ngram_mask_array[:len(ngram_ids)] = 1 + + # Zero-pad up to the max word in seq length. 
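+        # All n-gram feature lists are padded to ngram_dict.max_ngram_in_seq so
+        # the collator can stack them into fixed-shape tensors.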
+ padding = [0] * (ngram_dict.max_ngram_in_seq - len(ngram_ids)) + ngram_ids += padding + ngram_positions += padding + ngram_lengths += padding + ngram_seg_ids += padding + ngram_freqs += padding + + # ----------- code for ngram END----------- + + label_id = label_map[example.label] if example.label is not None else 0 + # if ex_index < 5: + # logger.info("*** Example ***") + # logger.info("guid: %s" % (example.guid)) + # logger.info("tokens: %s" % " ".join( + # [str(x) for x in tokens])) + # logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids])) + # logger.info("input_mask: %s" % " ".join([str(x) for x in input_mask])) + # logger.info( + # "segment_ids: %s" % " ".join([str(x) for x in segment_ids])) + # logger.info("label: %s (id = %d)" % (example.label, label_id)) + # logger.info("ngram_ids: %s" % " ".join([str(x) for x in ngram_ids])) + # logger.info("ngram_positions: %s" % " ".join([str(x) for x in ngram_positions])) + # logger.info("ngram_lengths: %s" % " ".join([str(x) for x in ngram_lengths])) + # logger.info("ngram_tuples: %s" % " ".join([str(x) for x in ngram_tuples])) + # logger.info("ngram_seg_ids: %s" % " ".join([str(x) for x in ngram_seg_ids])) + # logger.info("ngram_freqs: %s" % " ".join([str(x) for x in ngram_freqs])) + + features.append( + InputFeatures(input_ids=input_ids, + input_mask=input_mask, + segment_ids=segment_ids, + label_id=label_id, + ngram_ids=ngram_ids, + ngram_starts=ngram_positions, + ngram_lengths=ngram_lengths, + ngram_tuples=ngram_tuples, + ngram_seg_ids=ngram_seg_ids, + ngram_masks=ngram_mask_array, + ngram_freqs=ngram_freqs, + qid=example.qid)) + return features + + +def _truncate_seq_pair(tokens_a, tokens_b, max_length): + """Truncates a sequence pair in place to the maximum length.""" + + # This is a simple heuristic which will always truncate the longer sequence + # one token at a time. This makes more sense than truncating an equal percent + # of tokens from each, since if one sequence is very short then each token + # that's truncated likely contains more information than a longer sequence. 
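+    # Worked example (assumed values): with max_length=5, tokens_a=['a','b','c','d']
+    # and tokens_b=['x','y'], the loop below pops once from the longer tokens_a,
+    # leaving tokens_a=['a','b','c'] and tokens_b=['x','y'] (total length 5).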
+ while True: + total_length = len(tokens_a) + len(tokens_b) + if total_length <= max_length: + break + if len(tokens_a) > len(tokens_b): + tokens_a.pop() + else: + tokens_b.pop() + + +class TaskDataset(Dataset): + def __init__(self, data_path, processor, mode='train'): + super().__init__() + self.data = self.load_data(data_path, processor, mode) + + def __len__(self): + return len(self.data) + + def __getitem__(self, index): + return self.data[index] + + def load_data(self, data_path, processor, mode): + if mode == "train": + examples = processor.get_examples(data_path, mode) + elif mode == "test": + examples = processor.get_examples(data_path, mode) + elif mode == "dev": + examples = processor.get_examples(data_path, mode) + return examples + + +@dataclass +class TaskCollator: + args = None + tokenizer = None + ngram_dict = None + label2id = None + + def __call__(self, samples): + features = convert_examples_to_features(samples, self.label2id, self.args.max_seq_length, self.tokenizer, self.ngram_dict) + # logger.info(" Num examples = %d", len(samples)) + input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long) + input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long) + segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long) + label_ids = torch.tensor([f.label_id for f in features], dtype=torch.long) + # qids = torch.tensor([f.qid for f in features], dtype=torch.long) + + ngram_ids = torch.tensor([f.ngram_ids for f in features], dtype=torch.long) + ngram_starts = torch.tensor([f.ngram_starts for f in features], dtype=torch.long) + ngram_lengths = torch.tensor([f.ngram_lengths for f in features], dtype=torch.long) + # ngram_seg_ids = torch.tensor([f.ngram_seg_ids for f in features], dtype=torch.long) + # ngram_masks = torch.tensor([f.ngram_masks for f in features], dtype=torch.long) + ngram_freqs = torch.tensor([f.ngram_freqs for f in features], dtype=torch.long) + + batch_size = len(samples) + ngram_positions_matrix = torch.zeros( + size=(batch_size, self.args.max_seq_length, self.ngram_dict.max_ngram_in_seq), + dtype=torch.int) + for batch_id in range(batch_size): + ngram_id = ngram_ids[batch_id] + ngram_start = ngram_starts[batch_id] + ngram_length = ngram_lengths[batch_id] + for i in range(len(ngram_id)): + ngram_positions_matrix[batch_id][ngram_start[i]:ngram_start[i] + ngram_length[i], i] = ngram_freqs[batch_id][i] + ngram_positions_matrix[batch_id] \ + = torch.div(ngram_positions_matrix[batch_id], + torch.stack([torch.sum(ngram_positions_matrix[batch_id], 1)] * + ngram_positions_matrix[batch_id].size(1)).t() + 1e-10) + + return { + 'input_ids': input_ids, + 'input_ngram_ids': ngram_ids, + 'ngram_position_matrix': ngram_positions_matrix, + 'attention_mask': input_mask, + 'token_type_ids': segment_ids, + 'labels': label_ids + + } + + # return default_collate(sample_list) + + +class TaskDataModel(pl.LightningDataModule): + @staticmethod + def add_data_specific_args(parent_args): + parser = parent_args.add_argument_group('TASK NAME DataModel') + parser.add_argument('--data_dir', default='./data', type=str) + parser.add_argument('--num_workers', default=8, type=int) + parser.add_argument('--train_data', default='train.json', type=str) + parser.add_argument('--valid_data', default='dev.json', type=str) + parser.add_argument('--test_data', default='test.json', type=str) + parser.add_argument('--train_batchsize', default=16, type=int) + parser.add_argument('--valid_batchsize', default=32, type=int) + 
parser.add_argument('--max_seq_length', default=128, type=int) + + parser.add_argument('--texta_name', default='text', type=str) + parser.add_argument('--textb_name', default='sentence2', type=str) + parser.add_argument('--label_name', default='label', type=str) + parser.add_argument('--id_name', default='id', type=str) + + parser.add_argument('--dataset_name', default=None, type=str) + parser.add_argument('--vocab_file', + type=str, default=None, + help="Vocabulary mapping/file BERT was pretrainined on") + parser.add_argument("--do_lower_case", + action='store_true', + help="Set this flag if you are using an uncased model.") + parser.add_argument('--task_name', default='tnews', type=str) + + return parent_args + + def __init__(self, args): + super().__init__() + self.train_batchsize = args.train_batchsize + self.valid_batchsize = args.valid_batchsize + self.collator = TaskCollator() + self.collator.args = args + self.collator.tokenizer = BertTokenizer.from_pretrained(args.pretrained_model_path, do_lower_case=args.do_lower_case) + self.collator.ngram_dict = ZenNgramDict.from_pretrained(args.pretrained_model_path, tokenizer=self.collator.tokenizer) + + processors = { + 'afqmc': OcnliProcessor, + 'tnews': TnewsProcessor, + 'ocnli': OcnliProcessor, + 'cmnli': OcnliProcessor, + 'iflytek': IflytekProcessor, + } + if args.task_name not in processors: + raise ValueError("Task not found: %s" % (args.task_name)) + processor = processors[args.task_name]() + if args.dataset_name is None: + self.label2id, self.id2label = self.load_schema(os.path.join( + args.data_dir, args.train_data), args) + self.train_data = TaskDataset(os.path.join( + args.data_dir, args.train_data), processor, mode='train') + self.valid_data = TaskDataset(os.path.join( + args.data_dir, args.valid_data), processor, mode='dev') + self.test_data = TaskDataset(os.path.join( + args.data_dir, args.test_data), processor, mode='test') + self.collator.label2id = self.label2id + else: + import datasets + ds = datasets.load_dataset(args.dataset_name) + self.train_data = ds['train'] + self.valid_data = ds['validation'] + self.test_data = ds['test'] + self.save_hyperparameters(args) + + def train_dataloader(self): + return DataLoader(self.train_data, shuffle=True, batch_size=self.train_batchsize, pin_memory=False, + collate_fn=self.collator) + + def val_dataloader(self): + return DataLoader(self.valid_data, shuffle=False, batch_size=self.valid_batchsize, pin_memory=False, + collate_fn=self.collator) + + def predict_dataloader(self): + return DataLoader(self.test_data, shuffle=False, batch_size=self.valid_batchsize, pin_memory=False, + collate_fn=self.collator) + + def load_schema(self, data_path, args): + with open(data_path, 'r', encoding='utf8') as f: + lines = f.readlines() + label_list = [] + for line in tqdm(lines): + data = json.loads(line) + labels = data[args.label_name] if args.label_name in data.keys( + ) else 0 + if labels not in label_list: + label_list.append(labels) + + label2id, id2label = {}, {} + for i, k in enumerate(label_list): + label2id[k] = i + id2label[i] = k + return label2id, id2label + + +class LitModel(pl.LightningModule): + + @staticmethod + def add_model_specific_args(parent_args): + parser = parent_args.add_argument_group('BaseModel') + parser.add_argument('--num_labels', default=2, type=int) + + return parent_args + + def __init__(self, args): + super().__init__() + self.model = ZenForSequenceClassification.from_pretrained(args.pretrained_model_path, num_labels=args.num_labels) + self.save_hyperparameters(args) 
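+        # Worked example (assumed numbers) for the total_steps estimate in setup():
+        # with 10,000 training examples, train_batchsize=32, world_size=1,
+        # accumulate_grad_batches=1 and max_epochs=10,
+        # total_steps = (10000 * 10 // (32 * 1)) // 1 = 3125.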
+ + def setup(self, stage) -> None: + if stage == 'fit': + train_loader = self.trainer._data_connector._train_dataloader_source.dataloader() + + # Calculate total steps + if self.trainer.max_epochs > 0: + world_size = self.trainer.world_size + tb_size = self.hparams.train_batchsize * max(1, world_size) + ab_size = self.trainer.accumulate_grad_batches + self.total_steps = (len(train_loader.dataset) * + self.trainer.max_epochs // tb_size) // ab_size + else: + self.total_steps = self.trainer.max_steps // self.trainer.accumulate_grad_batches + + print('Total steps: {}' .format(self.total_steps)) + + def training_step(self, batch, batch_idx): + loss, logits = self.model(**batch) + acc = self.comput_metrix(logits, batch['labels']) + self.log('train_loss', loss) + self.log('train_acc', acc) + return loss + + def comput_metrix(self, logits, labels): + y_pred = torch.argmax(logits, dim=-1) + y_pred = y_pred.view(size=(-1,)) + y_true = labels.view(size=(-1,)).float() + corr = torch.eq(y_pred, y_true) + acc = torch.sum(corr.float())/labels.size()[0] + return acc + + def validation_step(self, batch, batch_idx): + loss, logits = self.model(**batch) + acc = self.comput_metrix(logits, batch['labels']) + self.log('val_loss', loss) + self.log('val_acc', acc) + + def predict_step(self, batch, batch_idx): + output = self.model(**batch) + return output.logits + + def configure_optimizers(self): + from fengshen.models.model_utils import configure_optimizers + return configure_optimizers(self) + + +class TaskModelCheckpoint: + @staticmethod + def add_argparse_args(parent_args): + parser = parent_args.add_argument_group('BaseModel') + + parser.add_argument('--monitor', default='train_loss', type=str) + parser.add_argument('--mode', default='min', type=str) + parser.add_argument('--dirpath', default='./log/', type=str) + parser.add_argument( + '--filename', default='model-{epoch:02d}-{train_loss:.4f}', type=str) + + parser.add_argument('--save_top_k', default=3, type=float) + parser.add_argument('--every_n_train_steps', default=100, type=float) + parser.add_argument('--save_weights_only', default=True, type=bool) + + return parent_args + + def __init__(self, args): + self.callbacks = ModelCheckpoint(monitor=args.monitor, + save_top_k=args.save_top_k, + mode=args.mode, + every_n_train_steps=args.every_n_train_steps, + save_weights_only=args.save_weights_only, + dirpath=args.dirpath, + filename=args.filename) + + +def save_test(data, args, data_model): + with open(args.output_save_path, 'w', encoding='utf-8') as f: + idx = 0 + for i in range(len(data)): + batch = data[i] + for sample in batch: + tmp_result = dict() + label_id = np.argmax(sample.numpy()) + tmp_result['id'] = data_model.test_data.data[idx]['id'] + tmp_result['label'] = data_model.id2label[label_id] + json_data = json.dumps(tmp_result, ensure_ascii=False) + f.write(json_data+'\n') + idx += 1 + print('save the result to '+args.output_save_path) + + +def main(): + total_parser = argparse.ArgumentParser("TASK NAME") + total_parser.add_argument('--pretrained_model_path', default='', type=str) + total_parser.add_argument('--output_save_path', + default='./predict.json', type=str) + # * Args for data preprocessing + total_parser = TaskDataModel.add_data_specific_args(total_parser) + # * Args for training + total_parser = pl.Trainer.add_argparse_args(total_parser) + total_parser = TaskModelCheckpoint.add_argparse_args(total_parser) + + # * Args for base model + from fengshen.models.model_utils import add_module_args + total_parser = 
add_module_args(total_parser) + total_parser = LitModel.add_model_specific_args(total_parser) + + args = total_parser.parse_args() + + checkpoint_callback = TaskModelCheckpoint(args).callbacks + lr_monitor = LearningRateMonitor(logging_interval='step') + trainer = pl.Trainer.from_argparse_args(args, + callbacks=[checkpoint_callback, lr_monitor] + ) + + data_model = TaskDataModel(args) + model = LitModel(args) + trainer.fit(model, data_model) + + +if __name__ == "__main__": + main() diff --git a/fengshen/examples/zen2_finetune/fengshen_token_level_ft_task.py b/fengshen/examples/zen2_finetune/fengshen_token_level_ft_task.py new file mode 100644 index 0000000000000000000000000000000000000000..619847c1555311226be69d7d0558368dfd048546 --- /dev/null +++ b/fengshen/examples/zen2_finetune/fengshen_token_level_ft_task.py @@ -0,0 +1,678 @@ +# coding=utf-8 +# Copyright 2021 The IDEA Authors. All rights reserved. + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at + +# http://www.apache.org/licenses/LICENSE-2.0 + +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from fengshen.models.zen2.modeling import ZenForTokenClassification +from fengshen.metric.metric import SeqEntityScore +from fengshen.models.zen2.tokenization import BertTokenizer +from fengshen.models.zen2.ngram_utils import ZenNgramDict +from pytorch_lightning.callbacks import LearningRateMonitor +from dataclasses import dataclass +import logging +import math +import numpy as np +import os +import json +import torch +import pytorch_lightning as pl +import argparse +from pytorch_lightning.callbacks import ModelCheckpoint +from torch.utils.data import Dataset, DataLoader + +import torch.nn.functional as F +logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s', + datefmt='%m/%d/%Y %H:%M:%S', + level=logging.ERROR) +logger = logging.getLogger(__name__) + + +class InputExample(object): + """A single training/test example for simple sequence classification.""" + + def __init__(self, guid, text_a, text_b=None, label=None): + """Constructs a InputExample. + + Args: + guid: Unique id for the example. + text_a: string. The untokenized text of the first sequence. For single + sequence tasks, only this sequence must be specified. + text_b: (Optional) string. The untokenized text of the second sequence. + Only must be specified for sequence pair tasks. + label: (Optional) string. The label of the example. This should be + specified for train and dev examples, but not for test examples. 
+ """ + self.guid = guid + self.text_a = text_a + self.text_b = text_b + self.label = label + + +class InputFeatures(object): + """A single set of features of data.""" + + def __init__(self, input_ids, input_mask, segment_ids, label_id, ngram_ids, ngram_positions, ngram_lengths, + ngram_tuples, ngram_seg_ids, ngram_masks, valid_ids=None, label_mask=None, b_use_valid_filter=False): + self.input_ids = input_ids + self.input_mask = input_mask + self.segment_ids = segment_ids + self.label_id = label_id + self.valid_ids = valid_ids + self.label_mask = label_mask + + self.ngram_ids = ngram_ids + self.ngram_positions = ngram_positions + self.ngram_lengths = ngram_lengths + self.ngram_tuples = ngram_tuples + self.ngram_seg_ids = ngram_seg_ids + self.ngram_masks = ngram_masks + + self.b_use_valid_filter = b_use_valid_filter + + +def convert_examples_to_features(examples, label_map, max_seq_length, tokenizer, ngram_dict): + """Loads a data file into a list of `InputBatch`s.""" + + # label_map = {label: i for i, label in enumerate(label_list, 1)} + # label_map["[PAD]"] = 0 + + features = [] + b_use_valid_filter = False + for (ex_index, example) in enumerate(examples): + textlist = example.text_a + labellist = example.label + tokens = [] + labels = [] + valid = [] + label_mask = [] + for i, word in enumerate(textlist): + token = tokenizer.tokenize(word) + if len(tokens) + len(token) > max_seq_length - 2: + break + tokens.extend(token) + label_1 = labellist[i] + for m in range(len(token)): + if m == 0: + labels.append(label_1) + valid.append(1) + label_mask.append(1) + else: + valid.append(0) + b_use_valid_filter = True + ntokens = [] + segment_ids = [] + label_ids = [] + ntokens.append("[CLS]") + segment_ids.append(0) + valid.insert(0, 1) + label_mask.insert(0, 1) + label_ids.append(label_map["[CLS]"]) + for i, token in enumerate(tokens): + ntokens.append(token) + segment_ids.append(0) + if len(labels) > i: + label_ids.append(label_map[labels[i]]) + ntokens.append("[SEP]") + segment_ids.append(0) + valid.append(1) + label_mask.append(1) + label_ids.append(label_map["[SEP]"]) + input_ids = tokenizer.convert_tokens_to_ids(ntokens) + input_mask = [1] * len(input_ids) + label_mask = [1] * len(label_ids) + while len(input_ids) < max_seq_length: + input_ids.append(0) + input_mask.append(0) + segment_ids.append(0) + label_ids.append(0) + valid.append(1) + label_mask.append(0) + while len(label_ids) < max_seq_length: + label_ids.append(0) + label_mask.append(0) + assert len(input_ids) == max_seq_length + assert len(input_mask) == max_seq_length + assert len(segment_ids) == max_seq_length + assert len(label_ids) == max_seq_length + assert len(valid) == max_seq_length + assert len(label_mask) == max_seq_length + + # ----------- code for ngram BEGIN----------- + ngram_matches = [] + # Filter the ngram segment from 2 to 7 to check whether there is a ngram + max_gram_n = ngram_dict.max_ngram_len + for p in range(2, max_gram_n): + for q in range(0, len(tokens) - p + 1): + character_segment = tokens[q:q + p] + # j is the starting position of the ngram + # i is the length of the current ngram + character_segment = tuple(character_segment) + if character_segment in ngram_dict.ngram_to_id_dict: + ngram_index = ngram_dict.ngram_to_id_dict[character_segment] + ngram_freq = ngram_dict.ngram_to_freq_dict[character_segment] + ngram_matches.append([ngram_index, q, p, character_segment, ngram_freq]) + + ngram_matches = sorted(ngram_matches, key=lambda s: s[0]) + + max_ngram_in_seq_proportion = math.ceil((len(tokens) / 
max_seq_length) * ngram_dict.max_ngram_in_seq) + if len(ngram_matches) > max_ngram_in_seq_proportion: + ngram_matches = ngram_matches[:max_ngram_in_seq_proportion] + + ngram_ids = [ngram[0] for ngram in ngram_matches] + ngram_positions = [ngram[1] for ngram in ngram_matches] + ngram_lengths = [ngram[2] for ngram in ngram_matches] + ngram_tuples = [ngram[3] for ngram in ngram_matches] + ngram_freqs = [ngram[4] for ngram in ngram_matches] + ngram_seg_ids = [0 if position < (len(tokens) + 2) else 1 for position in ngram_positions] + + ngram_mask_array = np.zeros(ngram_dict.max_ngram_in_seq, dtype=np.bool) + ngram_mask_array[:len(ngram_ids)] = 1 + + # record the masked positions + ngram_positions_matrix = np.zeros(shape=(max_seq_length, ngram_dict.max_ngram_in_seq), dtype=np.int32) + for i in range(len(ngram_ids)): + ngram_positions_matrix[ngram_positions[i]:ngram_positions[i] + ngram_lengths[i], i] = ngram_freqs[i] + ngram_positions_matrix = torch.from_numpy(ngram_positions_matrix.astype(np.float)) + ngram_positions_matrix = torch.div(ngram_positions_matrix, torch.stack( + [torch.sum(ngram_positions_matrix, 1)] * ngram_positions_matrix.size(1)).t() + 1e-10) + ngram_positions_matrix = ngram_positions_matrix.numpy() + + # Zero-pad up to the max ngram in seq length. + padding = [0] * (ngram_dict.max_ngram_in_seq - len(ngram_ids)) + ngram_ids += padding + ngram_lengths += padding + ngram_seg_ids += padding + + # ----------- code for ngram END----------- + + if ex_index < 5: + logger.info("*** Example ***") + logger.info("guid: %s" % (example.guid)) + logger.info("tokens: %s" % " ".join([str(x) for x in tokens])) + logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids])) + logger.info("input_mask: %s" % " ".join([str(x) for x in input_mask])) + logger.info("segment_ids: %s" % " ".join([str(x) for x in segment_ids])) + logger.info("label: %s (id = %s)" % (",".join([str(x) for x in example.label]), ",".join([str(x) for x in label_ids]))) + logger.info("valid: %s" % " ".join([str(x) for x in valid])) + logger.info("b_use_valid_filter: %s" % str(b_use_valid_filter)) + logger.info("ngram_ids: %s" % " ".join([str(x) for x in ngram_ids])) + logger.info("ngram_positions: %s" % " ".join([str(x) for x in ngram_positions])) + logger.info("ngram_lengths: %s" % " ".join([str(x) for x in ngram_lengths])) + logger.info("ngram_tuples: %s" % " ".join([str(x) for x in ngram_tuples])) + logger.info("ngram_seg_ids: %s" % " ".join([str(x) for x in ngram_seg_ids])) + + features.append( + InputFeatures(input_ids=input_ids, + input_mask=input_mask, + segment_ids=segment_ids, + label_id=label_ids, + ngram_ids=ngram_ids, + ngram_positions=ngram_positions_matrix, + ngram_lengths=ngram_lengths, + ngram_tuples=ngram_tuples, + ngram_seg_ids=ngram_seg_ids, + ngram_masks=ngram_mask_array, + valid_ids=valid, + label_mask=label_mask, + b_use_valid_filter=b_use_valid_filter)) + return features + + +class DataProcessor(object): + """Base class for data converters for sequence classification data sets.""" + + def get_examples(self, data_path, set_type, quotechar=' '): + """See base class.""" + return self._create_examples( + self._read_tsv(data_path, self.get_quotechar()), set_type) + + def _create_examples(self, lines, set_type): + examples = [] + for i, (sentence, label) in enumerate(lines): + guid = "%s-%s" % (set_type, i) + text_a = sentence + label = label + examples.append(InputExample(guid=guid, text_a=text_a, label=label)) + return examples + + def get_labels(self): + """Gets the list of labels for this data 
set.""" + raise NotImplementedError() + + def get_quotechar(self): + return ' ' + + @classmethod + def _read_tsv(cls, input_file, quotechar=None): + ''' + read file + return format : + [ ['EU', 'B-ORG'], ['rejects', 'O'], ['German', 'B-MISC'], ['call', 'O'], ['to', 'O'], ['boycott', 'O'], ['British', 'B-MISC'], ['lamb', 'O'], ['.', 'O'] ] + ''' + f = open(input_file) + data = [] + sentence = [] + label = [] + for line in f: + if len(line) == 0 or line.startswith('-DOCSTART') or line[0] == "\n": + if len(sentence) > 0: + data.append((sentence, label)) + sentence = [] + label = [] + continue + splits = line.split(quotechar) + sentence.append(splits[0]) + label.append(splits[-1][:-1]) + + if len(sentence) > 0: + data.append((sentence, label)) + sentence = [] + label = [] + return data + + +class MSRAProcessor(DataProcessor): + """Processor for the msra data set.""" + + def get_labels(self): + return ['B-NR', 'B-NS', 'B-NT', 'E-NR', 'E-NS', 'E-NT', 'M-NR', + 'M-NS', 'M-NT', 'O', 'S-NR', 'S-NS', 'S-NT', '[CLS]', '[SEP]'] + + +class OntoNotes4Processor(DataProcessor): + """Processor for the OntoNotes4 data set.""" + + def get_labels(self): + return ['B-GPE', 'B-LOC', 'B-ORG', 'B-PER', 'E-GPE', 'E-LOC', + 'E-ORG', 'E-PER', 'M-GPE', 'M-LOC', 'M-ORG', 'M-PER', 'O', + 'S-GPE', 'S-LOC', 'S-ORG', 'S-PER', '[CLS]', '[SEP]'] + + +class WeiboProcessor(DataProcessor): + """Processor for the Weibo data set.""" + + def get_labels(self): + return ['B-GPE.NAM', 'B-GPE.NOM', 'B-LOC.NAM', 'B-LOC.NOM', + 'B-ORG.NAM', 'B-ORG.NOM', 'B-PER.NAM', 'B-PER.NOM', 'E-GPE.NAM', + 'E-GPE.NOM', 'E-LOC.NAM', 'E-LOC.NOM', 'E-ORG.NAM', 'E-ORG.NOM', + 'E-PER.NAM', 'E-PER.NOM', 'M-GPE.NAM', 'M-LOC.NAM', 'M-LOC.NOM', + 'M-ORG.NAM', 'M-ORG.NOM', 'M-PER.NAM', 'M-PER.NOM', 'O', + 'S-GPE.NAM', 'S-LOC.NOM', 'S-PER.NAM', 'S-PER.NOM', '[CLS]', '[SEP]'] + + +class ResumeProcessor(DataProcessor): + """Processor for the resume data set.""" + + def get_labels(self): + return ['B-CONT', 'B-EDU', 'B-LOC', 'B-NAME', 'B-ORG', 'B-PRO', + 'B-RACE', 'B-TITLE', 'E-CONT', 'E-EDU', 'E-LOC', 'E-NAME', + 'E-ORG', 'E-PRO', 'E-RACE', 'E-TITLE', 'M-CONT', 'M-EDU', + 'M-LOC', 'M-NAME', 'M-ORG', 'M-PRO', 'M-RACE', 'M-TITLE', + 'O', 'S-NAME', 'S-ORG', 'S-RACE', '[CLS]', '[SEP]'] + + +class CMeEEProcessor(DataProcessor): + """Processor for the CMeEE data set.""" + + def get_quotechar(self): + return '\t' + + def get_labels(self): + return ['B-临床表现', 'B-医学检验项目', 'B-医疗程序', 'B-医疗设备', + 'B-微生物类', 'B-疾病', 'B-科室', 'B-药物', 'B-身体', 'I-临床表现', + 'I-医学检验项目', 'I-医疗程序', 'I-医疗设备', 'I-微生物类', + 'I-疾病', 'I-科室', 'I-药物', 'I-身体', 'O', '[CLS]', '[SEP]'] + + +class CLUENERProcessor(DataProcessor): + """Processor for the CLUENER data set.""" + + def get_quotechar(self): + return '\t' + + def get_labels(self): + return ['B-书名', 'B-公司', 'B-地址', 'B-姓名', 'B-政府', 'B-景点', + 'B-游戏', 'B-电影', 'B-组织机构', 'B-职位', 'I-书名', 'I-公司', + 'I-地址', 'I-姓名', 'I-政府', 'I-景点', 'I-游戏', 'I-电影', + 'I-组织机构', 'I-职位', 'O', '[CLS]', '[SEP]'] + + +class TaskDataset(Dataset): + def __init__(self, data_path, processor, mode='train'): + super().__init__() + self.data = self.load_data(data_path, processor, mode) + + def __len__(self): + return len(self.data) + + def __getitem__(self, index): + return self.data[index] + + def load_data(self, data_path, processor, mode): + if mode == "train": + examples = processor.get_examples(data_path, mode) + elif mode == "test": + examples = processor.get_examples(data_path, mode) + elif mode == "dev": + examples = processor.get_examples(data_path, mode) + return examples + + 
+@dataclass +class TaskCollator: + args = None + tokenizer = None + ngram_dict = None + label2id = None + + def __call__(self, samples): + features = convert_examples_to_features(samples, self.label2id, self.args.max_seq_length, self.tokenizer, self.ngram_dict) + # logger.info(" Num examples = %d", len(samples)) + + input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long) + input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long) + segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long) + label_ids = torch.tensor([f.label_id for f in features], dtype=torch.long) + valid_ids = torch.tensor([f.valid_ids for f in features], dtype=torch.long) + + ngram_ids = torch.tensor([f.ngram_ids for f in features], dtype=torch.long) + ngram_positions = torch.tensor([f.ngram_positions for f in features], dtype=torch.long) + # ngram_lengths = torch.tensor([f.ngram_lengths for f in features], dtype=torch.long) + # ngram_seg_ids = torch.tensor([f.ngram_seg_ids for f in features], dtype=torch.long) + # ngram_masks = torch.tensor([f.ngram_masks for f in features], dtype=torch.long) + + # label_mask = torch.tensor([f.label_mask for f in features], dtype=torch.long) + b_use_valid_filter = torch.tensor([f.b_use_valid_filter for f in features], dtype=torch.bool) + # 取第一个出来? + # b_use_valid_filter = b_use_valid_filter.detach().cpu().numpy()[0] + b_use_valid_filter = b_use_valid_filter[0] + return { + 'input_ids': input_ids, + 'input_ngram_ids': ngram_ids, + 'ngram_position_matrix': ngram_positions, + 'attention_mask': input_mask, + 'token_type_ids': segment_ids, + 'labels': label_ids, + 'valid_ids': valid_ids, + 'b_use_valid_filter': b_use_valid_filter, + } + + +class TaskDataModel(pl.LightningDataModule): + @staticmethod + def add_data_specific_args(parent_args): + parser = parent_args.add_argument_group('TASK NAME DataModel') + parser.add_argument('--data_dir', default='./data', type=str) + parser.add_argument('--num_workers', default=8, type=int) + parser.add_argument('--train_data', default='train.json', type=str) + parser.add_argument('--valid_data', default='dev.json', type=str) + parser.add_argument('--test_data', default='test.json', type=str) + parser.add_argument('--train_batchsize', default=16, type=int) + parser.add_argument('--valid_batchsize', default=32, type=int) + parser.add_argument('--max_seq_length', default=128, type=int) + + parser.add_argument('--texta_name', default='text', type=str) + parser.add_argument('--textb_name', default='sentence2', type=str) + parser.add_argument('--label_name', default='label', type=str) + parser.add_argument('--id_name', default='id', type=str) + + parser.add_argument('--dataset_name', default=None, type=str) + parser.add_argument('--vocab_file', + type=str, default=None, + help="Vocabulary mapping/file BERT was pretrainined on") + parser.add_argument("--do_lower_case", + action='store_true', + help="Set this flag if you are using an uncased model.") + parser.add_argument('--task_name', default='weibo', type=str) + + return parent_args + + def __init__(self, args): + super().__init__() + self.train_batchsize = args.train_batchsize + self.valid_batchsize = args.valid_batchsize + self.collator = TaskCollator() + self.collator.args = args + self.collator.tokenizer = BertTokenizer.from_pretrained(args.pretrained_model_path, do_lower_case=args.do_lower_case) + self.collator.ngram_dict = ZenNgramDict.from_pretrained(args.pretrained_model_path, tokenizer=self.collator.tokenizer) + + processors = { + 'weibo': 
WeiboProcessor, + 'resume': ResumeProcessor, + 'msra': MSRAProcessor, + 'ontonotes4': OntoNotes4Processor, + 'cmeee': CMeEEProcessor, + 'cluener': CLUENERProcessor, + } + if args.task_name not in processors: + raise ValueError("Task not found: %s" % (args.task_name)) + processor = processors[args.task_name]() + # 生成id映射 + label_list = processor.get_labels() + label2id = {label: i for i, label in enumerate(label_list, 1)} + label2id["[PAD]"] = 0 + self.id2label = {v: k for k, v in label2id.items()} + self.collator.label2id = label2id + + if args.dataset_name is None: + self.train_data = TaskDataset(os.path.join( + args.data_dir, args.train_data), processor, mode='train') + self.valid_data = TaskDataset(os.path.join( + args.data_dir, args.valid_data), processor, mode='dev') + self.test_data = TaskDataset(os.path.join( + args.data_dir, args.test_data), processor, mode='test') + + else: + import datasets + ds = datasets.load_dataset(args.dataset_name) + self.train_data = ds['train'] + self.valid_data = ds['validation'] + self.test_data = ds['test'] + self.save_hyperparameters(args) + + def train_dataloader(self): + return DataLoader(self.train_data, shuffle=True, batch_size=self.train_batchsize, pin_memory=False, + collate_fn=self.collator) + + def val_dataloader(self): + return DataLoader(self.valid_data, shuffle=False, batch_size=self.valid_batchsize, pin_memory=False, + collate_fn=self.collator) + + def predict_dataloader(self): + return DataLoader(self.test_data, shuffle=False, batch_size=self.valid_batchsize, pin_memory=False, + collate_fn=self.collator) + + +class LitModel(pl.LightningModule): + + @staticmethod + def add_model_specific_args(parent_args): + parser = parent_args.add_argument_group('BaseModel') + parser.add_argument('--markup', default='bios', type=str) + parser.add_argument('--middle_prefix', default='I-', type=str) + return parent_args + + def __init__(self, args, id2label): + super().__init__() + # config = ZenConfig(os.path.join(args.pretrained_model_path, 'config.json')) + self.model = ZenForTokenClassification.from_pretrained(args.pretrained_model_path, num_labels=len(id2label)) + self.seq_entity_score = SeqEntityScore(id2label, markup=args.markup, middle_prefix=args.middle_prefix) + self.train_seq_entity_score = SeqEntityScore(id2label, markup=args.markup, middle_prefix=args.middle_prefix) + self.id2label = id2label + self.label2id = {v: k for k, v in id2label.items()} + self.save_hyperparameters(args) + + def setup(self, stage) -> None: + if stage == 'fit': + train_loader = self.trainer._data_connector._train_dataloader_source.dataloader() + + # Calculate total steps + if self.trainer.max_epochs > 0: + world_size = self.trainer.world_size + tb_size = self.hparams.train_batchsize * max(1, world_size) + ab_size = self.trainer.accumulate_grad_batches + self.total_steps = (len(train_loader.dataset) * + self.trainer.max_epochs // tb_size) // ab_size + else: + self.total_steps = self.trainer.max_steps // self.trainer.accumulate_grad_batches + + print('Total steps: {}' .format(self.total_steps)) + + def training_step(self, batch, batch_idx): + outputs = self.model(**batch) + loss = outputs.loss + # logits = outputs.logits + # preds = torch.argmax(F.log_softmax(logits, dim=2), dim=2) + # preds = preds.detach().cpu().numpy() + # labels = batch['labels'].detach().cpu().numpy() + # num_labels = len(self.label2id) + # y_true = [] + # y_pred = [] + # for i, label in enumerate(labels): + # temp_1 = [] + # temp_2 = [] + # for j, m in enumerate(label): + # if j == 0: + # continue + 
# elif labels[i][j] == num_labels - 1: + # y_true.append(temp_1) + # y_pred.append(temp_2) + # break + # else: + # temp_1.append(self.id2label[labels[i][j]]) + # temp_2.append(self.id2label[preds[i][j]]) + + # self.train_seq_entity_score.update(y_true, y_pred) + # result = self.train_seq_entity_score.result() + # self.train_seq_entity_score.reset() + self.log('train_loss', loss) + + return loss + + def validation_step(self, batch, batch_idx): + outputs = self.model(**batch) + loss = outputs.loss + logits = outputs.logits + preds = torch.argmax(F.log_softmax(logits, dim=2), dim=2) + preds = preds.detach().cpu().numpy() + labels = batch['labels'].detach().cpu().numpy() + num_labels = len(self.label2id) + y_true = [] + y_pred = [] + for i, label in enumerate(labels): + temp_1 = [] + temp_2 = [] + for j, m in enumerate(label): + if j == 0: + continue + elif labels[i][j] == num_labels - 1: + y_true.append(temp_1) + y_pred.append(temp_2) + break + else: + temp_1.append(self.id2label[labels[i][j]]) + temp_2.append(self.id2label[preds[i][j]]) + + self.seq_entity_score.update(y_true, y_pred) + self.log('val_loss', loss) + + def validation_epoch_end(self, outputs): + # compute metric for all process + score_dict, _ = self.seq_entity_score.result() + if self.trainer._accelerator_connector.cluster_environment.global_rank() == 0: + print('score_dict:\n', score_dict) + # reset the metric after once validation + self.seq_entity_score.reset() + for k, v in score_dict.items(): + self.log('val_{}'.format(k), v) + + def configure_optimizers(self): + from fengshen.models.model_utils import configure_optimizers + return configure_optimizers(self) + + +class TaskModelCheckpoint: + @staticmethod + def add_argparse_args(parent_args): + parser = parent_args.add_argument_group('BaseModel') + + parser.add_argument('--monitor', default='train_loss', type=str) + parser.add_argument('--mode', default='min', type=str) + parser.add_argument('--dirpath', default='./log/', type=str) + parser.add_argument( + '--filename', default='model-{epoch:02d}-{train_loss:.4f}', type=str) + + parser.add_argument('--save_top_k', default=3, type=float) + parser.add_argument('--every_n_train_steps', default=100, type=float) + parser.add_argument('--save_weights_only', default=True, type=bool) + + return parent_args + + def __init__(self, args): + self.callbacks = ModelCheckpoint(monitor=args.monitor, + save_top_k=args.save_top_k, + mode=args.mode, + every_n_train_steps=args.every_n_train_steps, + save_weights_only=args.save_weights_only, + dirpath=args.dirpath, + filename=args.filename) + + +def save_test(data, args, data_model): + with open(args.output_save_path, 'w', encoding='utf-8') as f: + idx = 0 + for i in range(len(data)): + batch = data[i] + for sample in batch: + tmp_result = dict() + label_id = np.argmax(sample.numpy()) + tmp_result['id'] = data_model.test_data.data[idx]['id'] + tmp_result['label'] = data_model.id2label[label_id] + json_data = json.dumps(tmp_result, ensure_ascii=False) + f.write(json_data+'\n') + idx += 1 + print('save the result to '+args.output_save_path) + + +def main(): + total_parser = argparse.ArgumentParser("TASK NAME") + total_parser.add_argument('--pretrained_model_path', default='', type=str) + total_parser.add_argument('--output_save_path', + default='./predict.json', type=str) + # * Args for data preprocessing + total_parser = TaskDataModel.add_data_specific_args(total_parser) + # * Args for training + total_parser = pl.Trainer.add_argparse_args(total_parser) + total_parser = 
TaskModelCheckpoint.add_argparse_args(total_parser) + + # * Args for base model + from fengshen.models.model_utils import add_module_args + total_parser = add_module_args(total_parser) + total_parser = LitModel.add_model_specific_args(total_parser) + + args = total_parser.parse_args() + + checkpoint_callback = TaskModelCheckpoint(args).callbacks + lr_monitor = LearningRateMonitor(logging_interval='step') + trainer = pl.Trainer.from_argparse_args(args, + callbacks=[checkpoint_callback, lr_monitor] + ) + + data_model = TaskDataModel(args) + id2label = data_model.id2label + print('id2label:', id2label) + model = LitModel(args, id2label) + trainer.fit(model, data_model) + + +if __name__ == "__main__": + main() diff --git a/fengshen/examples/zen2_finetune/fs_zen2_base_afqmc.sh b/fengshen/examples/zen2_finetune/fs_zen2_base_afqmc.sh new file mode 100644 index 0000000000000000000000000000000000000000..7143e61be485f0d6dc2d7912b5b30250df408b75 --- /dev/null +++ b/fengshen/examples/zen2_finetune/fs_zen2_base_afqmc.sh @@ -0,0 +1,94 @@ +#!/bin/bash +#SBATCH --job-name=zen2_base_afqmc # create a short name for your job +#SBATCH --nodes=1 # node count +#SBATCH --ntasks=1 # total number of tasks across all nodes +#SBATCH --cpus-per-task=30 # cpu-cores per task (>1 if multi-threaded tasks) +#SBATCH --gres=gpu:1 # number of gpus per node +#SBATCH --mail-type=ALL # send email when job begins, ends or failed etc. +#SBATCH -o %x-%j.log # output and error file name (%x=job name, %j=job id) + + +export CUDA_VISIBLE_DEVICES='2' +export TORCH_EXTENSIONS_DIR=/cognitive_comp/ganruyi/tmp/torch_extendsions + +MODEL_NAME=zen2_base + +TASK=afqmc + +ZERO_STAGE=1 +STRATEGY=deepspeed_stage_${ZERO_STAGE} + +ROOT_DIR=/cognitive_comp/ganruyi/experiments/classification_finetune/${MODEL_NAME}_${TASK} +if [ ! -d ${ROOT_DIR} ];then + mkdir -p ${ROOT_DIR} + echo ${ROOT_DIR} created!!!!!!!!!!!!!! +else + echo ${ROOT_DIR} exist!!!!!!!!!!!!!!! 
+fi + +DATA_DIR=/cognitive_comp/yangping/data/ChineseCLUE_DATA/${TASK}_public/ +# PRETRAINED_MODEL_PATH=/cognitive_comp/ganruyi/hf_models/zen/zh_zen_base_2.0 +PRETRAINED_MODEL_PATH=/cognitive_comp/ganruyi/hf_models/zen/zh_zen_base_2.0 + +CHECKPOINT_PATH=${ROOT_DIR}/ckpt/ +OUTPUT_PATH=${ROOT_DIR}/predict.json + +DATA_ARGS="\ + --data_dir $DATA_DIR \ + --train_data train.json \ + --valid_data dev.json \ + --test_data test.json \ + --train_batchsize 32 \ + --valid_batchsize 16 \ + --max_seq_length 128 \ + --texta_name sentence \ + --label_name label \ + --id_name id \ + --task_name afqmc \ + " + +MODEL_ARGS="\ + --learning_rate 2e-5 \ + --weight_decay 0.1 \ + --warmup_ratio 0.01 \ + --num_labels 2 \ + " + +MODEL_CHECKPOINT_ARGS="\ + --monitor val_acc \ + --save_top_k 3 \ + --mode max \ + --every_n_train_steps 100 \ + --save_weights_only True \ + --dirpath $CHECKPOINT_PATH \ + --filename model-{epoch:02d}-{val_acc:.4f} \ + " + +TRAINER_ARGS="\ + --max_epochs 10 \ + --gpus 1 \ + --check_val_every_n_epoch 1 \ + --val_check_interval 100 \ + --default_root_dir $ROOT_DIR \ + " + + +options=" \ + --pretrained_model_path $PRETRAINED_MODEL_PATH \ + --vocab_file $PRETRAINED_MODEL_PATH/vocab.txt \ + --do_lower_case \ + --output_save_path $OUTPUT_PATH \ + $DATA_ARGS \ + $MODEL_ARGS \ + $MODEL_CHECKPOINT_ARGS \ + $TRAINER_ARGS \ +" +SCRIPT_PATH=/cognitive_comp/ganruyi/Fengshenbang-LM/fengshen/examples/zen2_finetune/fengshen_sequence_level_ft_task.py +/home/ganruyi/anaconda3/bin/python $SCRIPT_PATH $options + +# SINGULARITY_PATH=/cognitive_comp/ganruyi/pytorch21_06_py3_docker_image_v2.sif +# python3 $SCRIPT_PATH $options +# source activate base +# singularity exec --nv -B /cognitive_comp/:/cognitive_comp/ $SINGULARITY_PATH /home/ganruyi/anaconda3/bin/python $SCRIPT_PATH $options +# /home/ganruyi/anaconda3/bin/python $SCRIPT_PATH $options + diff --git a/fengshen/examples/zen2_finetune/fs_zen2_base_cmnli.sh b/fengshen/examples/zen2_finetune/fs_zen2_base_cmnli.sh new file mode 100644 index 0000000000000000000000000000000000000000..f6f4f7e9eec1d11a2bf1d09f8d57303ca139f8e2 --- /dev/null +++ b/fengshen/examples/zen2_finetune/fs_zen2_base_cmnli.sh @@ -0,0 +1,93 @@ +#!/bin/bash +#SBATCH --job-name=zen2_base_cmnli # create a short name for your job +#SBATCH --nodes=1 # node count +#SBATCH --ntasks=1 # total number of tasks across all nodes +#SBATCH --cpus-per-task=30 # cpu-cores per task (>1 if multi-threaded tasks) +#SBATCH --gres=gpu:1 # number of gpus per node +#SBATCH --mail-type=ALL # send email when job begins, ends or failed etc. +#SBATCH -o %x-%j.log # output and error file name (%x=job name, %j=job id) + + +export CUDA_VISIBLE_DEVICES='4' +export TORCH_EXTENSIONS_DIR=/cognitive_comp/ganruyi/tmp/torch_extendsions + +MODEL_NAME=zen2_base + +TASK=cmnli + +ZERO_STAGE=1 +STRATEGY=deepspeed_stage_${ZERO_STAGE} + +ROOT_DIR=/cognitive_comp/ganruyi/experiments/classification_finetune/${MODEL_NAME}_${TASK} +if [ ! -d ${ROOT_DIR} ];then + mkdir -p ${ROOT_DIR} + echo ${ROOT_DIR} created!!!!!!!!!!!!!! +else + echo ${ROOT_DIR} exist!!!!!!!!!!!!!!! 
+fi + +DATA_DIR=/cognitive_comp/yangping/data/ChineseCLUE_DATA/cmnli_public/${TASK}_public/ +PRETRAINED_MODEL_PATH=/cognitive_comp/ganruyi/hf_models/zen/zh_zen_base_2.0 + +CHECKPOINT_PATH=${ROOT_DIR}/ckpt/ +OUTPUT_PATH=${ROOT_DIR}/predict.json + +DATA_ARGS="\ + --data_dir $DATA_DIR \ + --train_data train.json \ + --valid_data dev.json \ + --test_data test.json \ + --train_batchsize 64 \ + --valid_batchsize 32 \ + --max_seq_length 128 \ + --texta_name sentence \ + --label_name label \ + --id_name id \ + --task_name cmnli \ + " + +MODEL_ARGS="\ + --learning_rate 2e-5 \ + --weight_decay 0.1 \ + --warmup_ratio 0.01 \ + --num_labels 3 \ + " + +MODEL_CHECKPOINT_ARGS="\ + --monitor val_acc \ + --save_top_k 3 \ + --mode max \ + --every_n_train_steps 100 \ + --save_weights_only True \ + --dirpath $CHECKPOINT_PATH \ + --filename model-{epoch:02d}-{val_acc:.4f} \ + " + +TRAINER_ARGS="\ + --max_epochs 10 \ + --gpus 1 \ + --check_val_every_n_epoch 1 \ + --val_check_interval 100 \ + --default_root_dir $ROOT_DIR \ + " + + +options=" \ + --pretrained_model_path $PRETRAINED_MODEL_PATH \ + --vocab_file $PRETRAINED_MODEL_PATH/vocab.txt \ + --do_lower_case \ + --output_save_path $OUTPUT_PATH \ + $DATA_ARGS \ + $MODEL_ARGS \ + $MODEL_CHECKPOINT_ARGS \ + $TRAINER_ARGS \ +" +SCRIPT_PATH=/cognitive_comp/ganruyi/Fengshenbang-LM/fengshen/examples/zen2_finetune/fengshen_sequence_level_ft_task.py +/home/ganruyi/anaconda3/bin/python $SCRIPT_PATH $options + +# SINGULARITY_PATH=/cognitive_comp/ganruyi/pytorch21_06_py3_docker_image_v2.sif +# python3 $SCRIPT_PATH $options +# source activate base +# singularity exec --nv -B /cognitive_comp/:/cognitive_comp/ $SINGULARITY_PATH /home/ganruyi/anaconda3/bin/python $SCRIPT_PATH $options +# /home/ganruyi/anaconda3/bin/python $SCRIPT_PATH $options + diff --git a/fengshen/examples/zen2_finetune/fs_zen2_base_iflytek.sh b/fengshen/examples/zen2_finetune/fs_zen2_base_iflytek.sh new file mode 100644 index 0000000000000000000000000000000000000000..9171a7c3264a856915fd9147096f097b8ebd43c8 --- /dev/null +++ b/fengshen/examples/zen2_finetune/fs_zen2_base_iflytek.sh @@ -0,0 +1,93 @@ +#!/bin/bash +#SBATCH --job-name=zen2_base_iflytek # create a short name for your job +#SBATCH --nodes=1 # node count +#SBATCH --ntasks=1 # total number of tasks across all nodes +#SBATCH --cpus-per-task=30 # cpu-cores per task (>1 if multi-threaded tasks) +#SBATCH --gres=gpu:1 # number of gpus per node +#SBATCH --mail-type=ALL # send email when job begins, ends or failed etc. +#SBATCH -o %x-%j.log # output and error file name (%x=job name, %j=job id) + + +export CUDA_VISIBLE_DEVICES='0' +export TORCH_EXTENSIONS_DIR=/cognitive_comp/ganruyi/tmp/torch_extendsions + +MODEL_NAME=zen2_base + +TASK=iflytek + +ZERO_STAGE=1 +STRATEGY=deepspeed_stage_${ZERO_STAGE} + +ROOT_DIR=/cognitive_comp/ganruyi/experiments/classification_finetune/${MODEL_NAME}_${TASK} +if [ ! -d ${ROOT_DIR} ];then + mkdir -p ${ROOT_DIR} + echo ${ROOT_DIR} created!!!!!!!!!!!!!! +else + echo ${ROOT_DIR} exist!!!!!!!!!!!!!!! 
+fi + +DATA_DIR=/cognitive_comp/yangping/data/ChineseCLUE_DATA/${TASK}_public/ +PRETRAINED_MODEL_PATH=/cognitive_comp/ganruyi/hf_models/zen/zh_zen_base_2.0 + +CHECKPOINT_PATH=${ROOT_DIR}/ckpt/ +OUTPUT_PATH=${ROOT_DIR}/predict.json + +DATA_ARGS="\ + --data_dir $DATA_DIR \ + --train_data train.json \ + --valid_data dev.json \ + --test_data test.json \ + --train_batchsize 32 \ + --valid_batchsize 16 \ + --max_seq_length 128 \ + --texta_name sentence \ + --label_name label \ + --id_name id \ + --task_name iflytek \ + " + +MODEL_ARGS="\ + --learning_rate 2e-5 \ + --weight_decay 0.1 \ + --warmup_ratio 0.01 \ + --num_labels 119 \ + " + +MODEL_CHECKPOINT_ARGS="\ + --monitor val_acc \ + --save_top_k 3 \ + --mode max \ + --every_n_train_steps 100 \ + --save_weights_only True \ + --dirpath $CHECKPOINT_PATH \ + --filename model-{epoch:02d}-{val_acc:.4f} \ + " + +TRAINER_ARGS="\ + --max_epochs 7 \ + --gpus 1 \ + --check_val_every_n_epoch 1 \ + --val_check_interval 100 \ + --default_root_dir $ROOT_DIR \ + " + + +options=" \ + --pretrained_model_path $PRETRAINED_MODEL_PATH \ + --vocab_file $PRETRAINED_MODEL_PATH/vocab.txt \ + --do_lower_case \ + --output_save_path $OUTPUT_PATH \ + $DATA_ARGS \ + $MODEL_ARGS \ + $MODEL_CHECKPOINT_ARGS \ + $TRAINER_ARGS \ +" +SCRIPT_PATH=/cognitive_comp/ganruyi/Fengshenbang-LM/fengshen/examples/zen2_finetune/fengshen_sequence_level_ft_task.py +/home/ganruyi/anaconda3/bin/python $SCRIPT_PATH $options + +# SINGULARITY_PATH=/cognitive_comp/ganruyi/pytorch21_06_py3_docker_image_v2.sif +# python3 $SCRIPT_PATH $options +# source activate base +# singularity exec --nv -B /cognitive_comp/:/cognitive_comp/ $SINGULARITY_PATH /home/ganruyi/anaconda3/bin/python $SCRIPT_PATH $options +# /home/ganruyi/anaconda3/bin/python $SCRIPT_PATH $options + diff --git a/fengshen/examples/zen2_finetune/fs_zen2_base_ocnli.sh b/fengshen/examples/zen2_finetune/fs_zen2_base_ocnli.sh new file mode 100644 index 0000000000000000000000000000000000000000..f635330a4b260391a3f9d4b01998ce8305d55b8e --- /dev/null +++ b/fengshen/examples/zen2_finetune/fs_zen2_base_ocnli.sh @@ -0,0 +1,93 @@ +#!/bin/bash +#SBATCH --job-name=zen2_base_ocnli # create a short name for your job +#SBATCH --nodes=1 # node count +#SBATCH --ntasks=1 # total number of tasks across all nodes +#SBATCH --cpus-per-task=30 # cpu-cores per task (>1 if multi-threaded tasks) +#SBATCH --gres=gpu:1 # number of gpus per node +#SBATCH --mail-type=ALL # send email when job begins, ends or failed etc. +#SBATCH -o %x-%j.log # output and error file name (%x=job name, %j=job id) + + +export CUDA_VISIBLE_DEVICES='1' +export TORCH_EXTENSIONS_DIR=/cognitive_comp/ganruyi/tmp/torch_extendsions + +MODEL_NAME=zen2_base + +TASK=ocnli + +ZERO_STAGE=1 +STRATEGY=deepspeed_stage_${ZERO_STAGE} + +ROOT_DIR=/cognitive_comp/ganruyi/experiments/classification_finetune/${MODEL_NAME}_${TASK} +if [ ! -d ${ROOT_DIR} ];then + mkdir -p ${ROOT_DIR} + echo ${ROOT_DIR} created!!!!!!!!!!!!!! +else + echo ${ROOT_DIR} exist!!!!!!!!!!!!!!! 
+fi + +DATA_DIR=/cognitive_comp/yangping/data/ChineseCLUE_DATA/${TASK}_public/ +PRETRAINED_MODEL_PATH=/cognitive_comp/ganruyi/hf_models/zen/zh_zen_base_2.0 + +CHECKPOINT_PATH=${ROOT_DIR}/ckpt/ +OUTPUT_PATH=${ROOT_DIR}/predict.json + +DATA_ARGS="\ + --data_dir $DATA_DIR \ + --train_data train.json \ + --valid_data dev.json \ + --test_data test.json \ + --train_batchsize 32 \ + --valid_batchsize 16 \ + --max_seq_length 128 \ + --texta_name sentence \ + --label_name label \ + --id_name id \ + --task_name ocnli \ + " + +MODEL_ARGS="\ + --learning_rate 2e-5 \ + --weight_decay 0.1 \ + --warmup_ratio 0.01 \ + --num_labels 3 \ + " + +MODEL_CHECKPOINT_ARGS="\ + --monitor val_acc \ + --save_top_k 3 \ + --mode max \ + --every_n_train_steps 100 \ + --save_weights_only True \ + --dirpath $CHECKPOINT_PATH \ + --filename model-{epoch:02d}-{val_acc:.4f} \ + " + +TRAINER_ARGS="\ + --max_epochs 10 \ + --gpus 1 \ + --check_val_every_n_epoch 1 \ + --val_check_interval 100 \ + --default_root_dir $ROOT_DIR \ + " + + +options=" \ + --pretrained_model_path $PRETRAINED_MODEL_PATH \ + --vocab_file $PRETRAINED_MODEL_PATH/vocab.txt \ + --do_lower_case \ + --output_save_path $OUTPUT_PATH \ + $DATA_ARGS \ + $MODEL_ARGS \ + $MODEL_CHECKPOINT_ARGS \ + $TRAINER_ARGS \ +" +SCRIPT_PATH=/cognitive_comp/ganruyi/Fengshenbang-LM/fengshen/examples/zen2_finetune/fengshen_sequence_level_ft_task.py +/home/ganruyi/anaconda3/bin/python $SCRIPT_PATH $options + +# SINGULARITY_PATH=/cognitive_comp/ganruyi/pytorch21_06_py3_docker_image_v2.sif +# python3 $SCRIPT_PATH $options +# source activate base +# singularity exec --nv -B /cognitive_comp/:/cognitive_comp/ $SINGULARITY_PATH /home/ganruyi/anaconda3/bin/python $SCRIPT_PATH $options +# /home/ganruyi/anaconda3/bin/python $SCRIPT_PATH $options + diff --git a/fengshen/examples/zen2_finetune/fs_zen2_base_tnews.sh b/fengshen/examples/zen2_finetune/fs_zen2_base_tnews.sh new file mode 100644 index 0000000000000000000000000000000000000000..dee88afbe2639a514745771538d6c0d40e8d3329 --- /dev/null +++ b/fengshen/examples/zen2_finetune/fs_zen2_base_tnews.sh @@ -0,0 +1,94 @@ +#!/bin/bash +#SBATCH --job-name=zen2_base_tnews # create a short name for your job +#SBATCH --nodes=1 # node count +#SBATCH --ntasks=1 # total number of tasks across all nodes +#SBATCH --cpus-per-task=30 # cpu-cores per task (>1 if multi-threaded tasks) +#SBATCH --gres=gpu:1 # number of gpus per node +#SBATCH --mail-type=ALL # send email when job begins, ends or failed etc. +#SBATCH -o %x-%j.log # output and error file name (%x=job name, %j=job id) + + +export CUDA_VISIBLE_DEVICES='2' +export TORCH_EXTENSIONS_DIR=/cognitive_comp/ganruyi/tmp/torch_extendsions + +MODEL_NAME=zen2_base + +TASK=tnews + +ZERO_STAGE=1 +STRATEGY=deepspeed_stage_${ZERO_STAGE} + +ROOT_DIR=/cognitive_comp/ganruyi/experiments/classification_finetune/${MODEL_NAME}_${TASK} +if [ ! -d ${ROOT_DIR} ];then + mkdir -p ${ROOT_DIR} + echo ${ROOT_DIR} created!!!!!!!!!!!!!! +else + echo ${ROOT_DIR} exist!!!!!!!!!!!!!!! 
+fi + +DATA_DIR=/cognitive_comp/yangping/data/ChineseCLUE_DATA/${TASK}_public/ +# PRETRAINED_MODEL_PATH=/cognitive_comp/ganruyi/hf_models/zen/zh_zen_base_2.0 +PRETRAINED_MODEL_PATH=IDEA-CCNL/Erlangshen-ZEN2-345M-Chinese + +CHECKPOINT_PATH=${ROOT_DIR}/ckpt/ +OUTPUT_PATH=${ROOT_DIR}/predict.json + +DATA_ARGS="\ + --data_dir $DATA_DIR \ + --train_data train.json \ + --valid_data dev.json \ + --test_data test1.1.json \ + --train_batchsize 32 \ + --valid_batchsize 16 \ + --max_seq_length 128 \ + --texta_name sentence \ + --label_name label \ + --id_name id \ + --task_name tnews \ + " + +MODEL_ARGS="\ + --learning_rate 2e-5 \ + --weight_decay 0.01 \ + --warmup_ratio 0.01 \ + --num_labels 15 \ + " + +MODEL_CHECKPOINT_ARGS="\ + --monitor val_acc \ + --save_top_k 3 \ + --mode max \ + --every_n_train_steps 400 \ + --save_weights_only True \ + --dirpath $CHECKPOINT_PATH \ + --filename model-{epoch:02d}-{val_acc:.4f} \ + " + +TRAINER_ARGS="\ + --max_epochs 10 \ + --gpus 1 \ + --check_val_every_n_epoch 1 \ + --val_check_interval 400 \ + --default_root_dir $ROOT_DIR \ + " + + +options=" \ + --pretrained_model_path $PRETRAINED_MODEL_PATH \ + --vocab_file $PRETRAINED_MODEL_PATH/vocab.txt \ + --do_lower_case \ + --output_save_path $OUTPUT_PATH \ + $DATA_ARGS \ + $MODEL_ARGS \ + $MODEL_CHECKPOINT_ARGS \ + $TRAINER_ARGS \ +" +SCRIPT_PATH=/cognitive_comp/ganruyi/Fengshenbang-LM/fengshen/examples/zen2_finetune/fengshen_sequence_level_ft_task.py +/home/ganruyi/anaconda3/bin/python $SCRIPT_PATH $options + +# SINGULARITY_PATH=/cognitive_comp/ganruyi/pytorch21_06_py3_docker_image_v2.sif +# python3 $SCRIPT_PATH $options +# source activate base +# singularity exec --nv -B /cognitive_comp/:/cognitive_comp/ $SINGULARITY_PATH /home/ganruyi/anaconda3/bin/python $SCRIPT_PATH $options +# /home/ganruyi/anaconda3/bin/python $SCRIPT_PATH $options + diff --git a/fengshen/examples/zen2_finetune/fs_zen2_large_afqmc.sh b/fengshen/examples/zen2_finetune/fs_zen2_large_afqmc.sh new file mode 100644 index 0000000000000000000000000000000000000000..1f44844a127b5bb39226c56b70bba85957dd735a --- /dev/null +++ b/fengshen/examples/zen2_finetune/fs_zen2_large_afqmc.sh @@ -0,0 +1,93 @@ +#!/bin/bash +#SBATCH --job-name=zen2_large_afqmc # create a short name for your job +#SBATCH --nodes=1 # node count +#SBATCH --ntasks=1 # total number of tasks across all nodes +#SBATCH --cpus-per-task=30 # cpu-cores per task (>1 if multi-threaded tasks) +#SBATCH --gres=gpu:1 # number of gpus per node +#SBATCH --mail-type=ALL # send email when job begins, ends or failed etc. +#SBATCH -o %x-%j.log # output and error file name (%x=job name, %j=job id) + + +export CUDA_VISIBLE_DEVICES='1' +export TORCH_EXTENSIONS_DIR=/cognitive_comp/ganruyi/tmp/torch_extendsions + +MODEL_NAME=zen2_large + +TASK=afqmc + +ZERO_STAGE=1 +STRATEGY=deepspeed_stage_${ZERO_STAGE} + +ROOT_DIR=/cognitive_comp/ganruyi/experiments/classification_finetune/${MODEL_NAME}_${TASK} +if [ ! -d ${ROOT_DIR} ];then + mkdir -p ${ROOT_DIR} + echo ${ROOT_DIR} created!!!!!!!!!!!!!! +else + echo ${ROOT_DIR} exist!!!!!!!!!!!!!!! 
+fi + +DATA_DIR=/cognitive_comp/yangping/data/ChineseCLUE_DATA/${TASK}_public/ +PRETRAINED_MODEL_PATH=/cognitive_comp/ganruyi/hf_models/zen/zh_zen_large_2.0 + +CHECKPOINT_PATH=${ROOT_DIR}/ckpt/ +OUTPUT_PATH=${ROOT_DIR}/predict.json + +DATA_ARGS="\ + --data_dir $DATA_DIR \ + --train_data train.json \ + --valid_data dev.json \ + --test_data test.json \ + --train_batchsize 32 \ + --valid_batchsize 16 \ + --max_seq_length 128 \ + --texta_name sentence \ + --label_name label \ + --id_name id \ + --task_name afqmc \ + " + +MODEL_ARGS="\ + --learning_rate 2e-5 \ + --weight_decay 0.1 \ + --warmup_ratio 0.01 \ + --num_labels 2 \ + " + +MODEL_CHECKPOINT_ARGS="\ + --monitor val_acc \ + --save_top_k 3 \ + --mode max \ + --every_n_train_steps 100 \ + --save_weights_only True \ + --dirpath $CHECKPOINT_PATH \ + --filename model-{epoch:02d}-{val_acc:.4f} \ + " + +TRAINER_ARGS="\ + --max_epochs 10 \ + --gpus 1 \ + --check_val_every_n_epoch 1 \ + --val_check_interval 100 \ + --default_root_dir $ROOT_DIR \ + " + + +options=" \ + --pretrained_model_path $PRETRAINED_MODEL_PATH \ + --vocab_file $PRETRAINED_MODEL_PATH/vocab.txt \ + --do_lower_case \ + --output_save_path $OUTPUT_PATH \ + $DATA_ARGS \ + $MODEL_ARGS \ + $MODEL_CHECKPOINT_ARGS \ + $TRAINER_ARGS \ +" +SCRIPT_PATH=/cognitive_comp/ganruyi/Fengshenbang-LM/fengshen/examples/zen2_finetune/fengshen_sequence_level_ft_task.py +/home/ganruyi/anaconda3/bin/python $SCRIPT_PATH $options + +# SINGULARITY_PATH=/cognitive_comp/ganruyi/pytorch21_06_py3_docker_image_v2.sif +# python3 $SCRIPT_PATH $options +# source activate base +# singularity exec --nv -B /cognitive_comp/:/cognitive_comp/ $SINGULARITY_PATH /home/ganruyi/anaconda3/bin/python $SCRIPT_PATH $options +# /home/ganruyi/anaconda3/bin/python $SCRIPT_PATH $options + diff --git a/fengshen/examples/zen2_finetune/fs_zen2_large_cmnli.sh b/fengshen/examples/zen2_finetune/fs_zen2_large_cmnli.sh new file mode 100644 index 0000000000000000000000000000000000000000..b2d6dfff35668596c0c748003b7b937d98604922 --- /dev/null +++ b/fengshen/examples/zen2_finetune/fs_zen2_large_cmnli.sh @@ -0,0 +1,93 @@ +#!/bin/bash +#SBATCH --job-name=zen2_large_cmnli # create a short name for your job +#SBATCH --nodes=1 # node count +#SBATCH --ntasks=1 # total number of tasks across all nodes +#SBATCH --cpus-per-task=30 # cpu-cores per task (>1 if multi-threaded tasks) +#SBATCH --gres=gpu:1 # number of gpus per node +#SBATCH --mail-type=ALL # send email when job begins, ends or failed etc. +#SBATCH -o %x-%j.log # output and error file name (%x=job name, %j=job id) + + +export CUDA_VISIBLE_DEVICES='3' +export TORCH_EXTENSIONS_DIR=/cognitive_comp/ganruyi/tmp/torch_extendsions + +MODEL_NAME=zen2_large + +TASK=cmnli + +ZERO_STAGE=1 +STRATEGY=deepspeed_stage_${ZERO_STAGE} + +ROOT_DIR=/cognitive_comp/ganruyi/experiments/classification_finetune/${MODEL_NAME}_${TASK} +if [ ! -d ${ROOT_DIR} ];then + mkdir -p ${ROOT_DIR} + echo ${ROOT_DIR} created!!!!!!!!!!!!!! +else + echo ${ROOT_DIR} exist!!!!!!!!!!!!!!! 
+fi + +DATA_DIR=/cognitive_comp/yangping/data/ChineseCLUE_DATA/cmnli_public/${TASK}_public/ +PRETRAINED_MODEL_PATH=/cognitive_comp/ganruyi/hf_models/zen/zh_zen_large_2.0 + +CHECKPOINT_PATH=${ROOT_DIR}/ckpt/ +OUTPUT_PATH=${ROOT_DIR}/predict.json + +DATA_ARGS="\ + --data_dir $DATA_DIR \ + --train_data train.json \ + --valid_data dev.json \ + --test_data test.json \ + --train_batchsize 32 \ + --valid_batchsize 32 \ + --max_seq_length 128 \ + --texta_name sentence \ + --label_name label \ + --id_name id \ + --task_name cmnli \ + " + +MODEL_ARGS="\ + --learning_rate 2e-5 \ + --weight_decay 0.1 \ + --warmup_ratio 0.01 \ + --num_labels 3 \ + " + +MODEL_CHECKPOINT_ARGS="\ + --monitor val_acc \ + --save_top_k 3 \ + --mode max \ + --every_n_train_steps 100 \ + --save_weights_only True \ + --dirpath $CHECKPOINT_PATH \ + --filename model-{epoch:02d}-{val_acc:.4f} \ + " + +TRAINER_ARGS="\ + --max_epochs 10 \ + --gpus 1 \ + --check_val_every_n_epoch 1 \ + --val_check_interval 100 \ + --default_root_dir $ROOT_DIR \ + " + + +options=" \ + --pretrained_model_path $PRETRAINED_MODEL_PATH \ + --vocab_file $PRETRAINED_MODEL_PATH/vocab.txt \ + --do_lower_case \ + --output_save_path $OUTPUT_PATH \ + $DATA_ARGS \ + $MODEL_ARGS \ + $MODEL_CHECKPOINT_ARGS \ + $TRAINER_ARGS \ +" +SCRIPT_PATH=/cognitive_comp/ganruyi/Fengshenbang-LM/fengshen/examples/zen2_finetune/fengshen_sequence_level_ft_task.py +/home/ganruyi/anaconda3/bin/python $SCRIPT_PATH $options + +# SINGULARITY_PATH=/cognitive_comp/ganruyi/pytorch21_06_py3_docker_image_v2.sif +# python3 $SCRIPT_PATH $options +# source activate base +# singularity exec --nv -B /cognitive_comp/:/cognitive_comp/ $SINGULARITY_PATH /home/ganruyi/anaconda3/bin/python $SCRIPT_PATH $options +# /home/ganruyi/anaconda3/bin/python $SCRIPT_PATH $options + diff --git a/fengshen/examples/zen2_finetune/fs_zen2_large_iflytek.sh b/fengshen/examples/zen2_finetune/fs_zen2_large_iflytek.sh new file mode 100644 index 0000000000000000000000000000000000000000..7afd7b24d27ddd1a6834935222a100351111d570 --- /dev/null +++ b/fengshen/examples/zen2_finetune/fs_zen2_large_iflytek.sh @@ -0,0 +1,93 @@ +#!/bin/bash +#SBATCH --job-name=zen2_large_iflytek # create a short name for your job +#SBATCH --nodes=1 # node count +#SBATCH --ntasks=1 # total number of tasks across all nodes +#SBATCH --cpus-per-task=30 # cpu-cores per task (>1 if multi-threaded tasks) +#SBATCH --gres=gpu:1 # number of gpus per node +#SBATCH --mail-type=ALL # send email when job begins, ends or failed etc. +#SBATCH -o %x-%j.log # output and error file name (%x=job name, %j=job id) + + +export CUDA_VISIBLE_DEVICES='5' +export TORCH_EXTENSIONS_DIR=/cognitive_comp/ganruyi/tmp/torch_extendsions + +MODEL_NAME=zen2_large + +TASK=iflytek + +ZERO_STAGE=1 +STRATEGY=deepspeed_stage_${ZERO_STAGE} + +ROOT_DIR=/cognitive_comp/ganruyi/experiments/classification_finetune/${MODEL_NAME}_${TASK} +if [ ! -d ${ROOT_DIR} ];then + mkdir -p ${ROOT_DIR} + echo ${ROOT_DIR} created!!!!!!!!!!!!!! +else + echo ${ROOT_DIR} exist!!!!!!!!!!!!!!! 
+fi + +DATA_DIR=/cognitive_comp/yangping/data/ChineseCLUE_DATA/${TASK}_public/ +PRETRAINED_MODEL_PATH=/cognitive_comp/ganruyi/hf_models/zen/zh_zen_large_2.0 + +CHECKPOINT_PATH=${ROOT_DIR}/ckpt/ +OUTPUT_PATH=${ROOT_DIR}/predict.json + +DATA_ARGS="\ + --data_dir $DATA_DIR \ + --train_data train.json \ + --valid_data dev.json \ + --test_data test.json \ + --train_batchsize 32 \ + --valid_batchsize 16 \ + --max_seq_length 128 \ + --texta_name sentence \ + --label_name label \ + --id_name id \ + --task_name iflytek \ + " + +MODEL_ARGS="\ + --learning_rate 2e-5 \ + --weight_decay 0.1 \ + --warmup_ratio 0.01 \ + --num_labels 119 \ + " + +MODEL_CHECKPOINT_ARGS="\ + --monitor val_acc \ + --save_top_k 3 \ + --mode max \ + --every_n_train_steps 100 \ + --save_weights_only True \ + --dirpath $CHECKPOINT_PATH \ + --filename model-{epoch:02d}-{val_acc:.4f} \ + " + +TRAINER_ARGS="\ + --max_epochs 7 \ + --gpus 1 \ + --check_val_every_n_epoch 1 \ + --val_check_interval 100 \ + --default_root_dir $ROOT_DIR \ + " + + +options=" \ + --pretrained_model_path $PRETRAINED_MODEL_PATH \ + --vocab_file $PRETRAINED_MODEL_PATH/vocab.txt \ + --do_lower_case \ + --output_save_path $OUTPUT_PATH \ + $DATA_ARGS \ + $MODEL_ARGS \ + $MODEL_CHECKPOINT_ARGS \ + $TRAINER_ARGS \ +" +SCRIPT_PATH=/cognitive_comp/ganruyi/Fengshenbang-LM/fengshen/examples/zen2_finetune/fengshen_sequence_level_ft_task.py +/home/ganruyi/anaconda3/bin/python $SCRIPT_PATH $options + +# SINGULARITY_PATH=/cognitive_comp/ganruyi/pytorch21_06_py3_docker_image_v2.sif +# python3 $SCRIPT_PATH $options +# source activate base +# singularity exec --nv -B /cognitive_comp/:/cognitive_comp/ $SINGULARITY_PATH /home/ganruyi/anaconda3/bin/python $SCRIPT_PATH $options +# /home/ganruyi/anaconda3/bin/python $SCRIPT_PATH $options + diff --git a/fengshen/examples/zen2_finetune/fs_zen2_large_ocnli.sh b/fengshen/examples/zen2_finetune/fs_zen2_large_ocnli.sh new file mode 100644 index 0000000000000000000000000000000000000000..5598ee8027a9bc41c4c196d71d98341557e0f4eb --- /dev/null +++ b/fengshen/examples/zen2_finetune/fs_zen2_large_ocnli.sh @@ -0,0 +1,93 @@ +#!/bin/bash +#SBATCH --job-name=zen2_large_ocnli # create a short name for your job +#SBATCH --nodes=1 # node count +#SBATCH --ntasks=1 # total number of tasks across all nodes +#SBATCH --cpus-per-task=30 # cpu-cores per task (>1 if multi-threaded tasks) +#SBATCH --gres=gpu:1 # number of gpus per node +#SBATCH --mail-type=ALL # send email when job begins, ends or failed etc. +#SBATCH -o %x-%j.log # output and error file name (%x=job name, %j=job id) + + +export CUDA_VISIBLE_DEVICES='6' +export TORCH_EXTENSIONS_DIR=/cognitive_comp/ganruyi/tmp/torch_extendsions + +MODEL_NAME=zen2_large + +TASK=ocnli + +ZERO_STAGE=1 +STRATEGY=deepspeed_stage_${ZERO_STAGE} + +ROOT_DIR=/cognitive_comp/ganruyi/experiments/classification_finetune/${MODEL_NAME}_${TASK} +if [ ! -d ${ROOT_DIR} ];then + mkdir -p ${ROOT_DIR} + echo ${ROOT_DIR} created!!!!!!!!!!!!!! +else + echo ${ROOT_DIR} exist!!!!!!!!!!!!!!! 
+fi + +DATA_DIR=/cognitive_comp/yangping/data/ChineseCLUE_DATA/${TASK}_public/ +PRETRAINED_MODEL_PATH=/cognitive_comp/ganruyi/hf_models/zen/zh_zen_large_2.0 + +CHECKPOINT_PATH=${ROOT_DIR}/ckpt/ +OUTPUT_PATH=${ROOT_DIR}/predict.json + +DATA_ARGS="\ + --data_dir $DATA_DIR \ + --train_data train.json \ + --valid_data dev.json \ + --test_data test.json \ + --train_batchsize 32 \ + --valid_batchsize 16 \ + --max_seq_length 128 \ + --texta_name sentence \ + --label_name label \ + --id_name id \ + --task_name ocnli \ + " + +MODEL_ARGS="\ + --learning_rate 2e-5 \ + --weight_decay 0.1 \ + --warmup_ratio 0.01 \ + --num_labels 3 \ + " + +MODEL_CHECKPOINT_ARGS="\ + --monitor val_acc \ + --save_top_k 3 \ + --mode max \ + --every_n_train_steps 100 \ + --save_weights_only True \ + --dirpath $CHECKPOINT_PATH \ + --filename model-{epoch:02d}-{val_acc:.4f} \ + " + +TRAINER_ARGS="\ + --max_epochs 10 \ + --gpus 1 \ + --check_val_every_n_epoch 1 \ + --val_check_interval 100 \ + --default_root_dir $ROOT_DIR \ + " + + +options=" \ + --pretrained_model_path $PRETRAINED_MODEL_PATH \ + --vocab_file $PRETRAINED_MODEL_PATH/vocab.txt \ + --do_lower_case \ + --output_save_path $OUTPUT_PATH \ + $DATA_ARGS \ + $MODEL_ARGS \ + $MODEL_CHECKPOINT_ARGS \ + $TRAINER_ARGS \ +" +SCRIPT_PATH=/cognitive_comp/ganruyi/Fengshenbang-LM/fengshen/examples/zen2_finetune/fengshen_sequence_level_ft_task.py +/home/ganruyi/anaconda3/bin/python $SCRIPT_PATH $options + +# SINGULARITY_PATH=/cognitive_comp/ganruyi/pytorch21_06_py3_docker_image_v2.sif +# python3 $SCRIPT_PATH $options +# source activate base +# singularity exec --nv -B /cognitive_comp/:/cognitive_comp/ $SINGULARITY_PATH /home/ganruyi/anaconda3/bin/python $SCRIPT_PATH $options +# /home/ganruyi/anaconda3/bin/python $SCRIPT_PATH $options + diff --git a/fengshen/examples/zen2_finetune/fs_zen2_large_tnews.sh b/fengshen/examples/zen2_finetune/fs_zen2_large_tnews.sh new file mode 100644 index 0000000000000000000000000000000000000000..ec081cd3191f951c3815af423329540a219b0114 --- /dev/null +++ b/fengshen/examples/zen2_finetune/fs_zen2_large_tnews.sh @@ -0,0 +1,93 @@ +#!/bin/bash +#SBATCH --job-name=zen2_large_tnews # create a short name for your job +#SBATCH --nodes=1 # node count +#SBATCH --ntasks=1 # total number of tasks across all nodes +#SBATCH --cpus-per-task=30 # cpu-cores per task (>1 if multi-threaded tasks) +#SBATCH --gres=gpu:1 # number of gpus per node +#SBATCH --mail-type=ALL # send email when job begins, ends or failed etc. +#SBATCH -o %x-%j.log # output and error file name (%x=job name, %j=job id) + + +# export CUDA_VISIBLE_DEVICES='2' +export TORCH_EXTENSIONS_DIR=/cognitive_comp/ganruyi/tmp/torch_extendsions + +MODEL_NAME=zen2_large + +TASK=tnews + +ZERO_STAGE=1 +STRATEGY=deepspeed_stage_${ZERO_STAGE} + +ROOT_DIR=/cognitive_comp/ganruyi/experiments/classification_finetune/${MODEL_NAME}_${TASK} +if [ ! -d ${ROOT_DIR} ];then + mkdir -p ${ROOT_DIR} + echo ${ROOT_DIR} created!!!!!!!!!!!!!! +else + echo ${ROOT_DIR} exist!!!!!!!!!!!!!!! 
+fi + +DATA_DIR=/cognitive_comp/yangping/data/ChineseCLUE_DATA/${TASK}_public/ +PRETRAINED_MODEL_PATH=IDEA-CCNL/Erlangshen-ZEN2-345M-Chinese + +CHECKPOINT_PATH=${ROOT_DIR}/ckpt/ +OUTPUT_PATH=${ROOT_DIR}/predict.json + +DATA_ARGS="\ + --data_dir $DATA_DIR \ + --train_data train.json \ + --valid_data dev.json \ + --test_data test1.1.json \ + --train_batchsize 32 \ + --valid_batchsize 16 \ + --max_seq_length 128 \ + --texta_name sentence \ + --label_name label \ + --id_name id \ + --task_name tnews \ + " + +MODEL_ARGS="\ + --learning_rate 2e-5 \ + --weight_decay 0.01 \ + --warmup_ratio 0.01 \ + --num_labels 15 \ + " + +MODEL_CHECKPOINT_ARGS="\ + --monitor val_acc \ + --save_top_k 3 \ + --mode max \ + --every_n_train_steps 400 \ + --save_weights_only True \ + --dirpath $CHECKPOINT_PATH \ + --filename model-{epoch:02d}-{val_acc:.4f} \ + " + +TRAINER_ARGS="\ + --max_epochs 10 \ + --gpus 1 \ + --check_val_every_n_epoch 1 \ + --val_check_interval 400 \ + --default_root_dir $ROOT_DIR \ + " + + +options=" \ + --pretrained_model_path $PRETRAINED_MODEL_PATH \ + --vocab_file $PRETRAINED_MODEL_PATH/vocab.txt \ + --do_lower_case \ + --output_save_path $OUTPUT_PATH \ + $DATA_ARGS \ + $MODEL_ARGS \ + $MODEL_CHECKPOINT_ARGS \ + $TRAINER_ARGS \ +" +SCRIPT_PATH=/cognitive_comp/ganruyi/Fengshenbang-LM/fengshen/examples/zen2_finetune/fengshen_sequence_level_ft_task.py +/home/ganruyi/anaconda3/bin/python $SCRIPT_PATH $options + +# SINGULARITY_PATH=/cognitive_comp/ganruyi/pytorch21_06_py3_docker_image_v2.sif +# python3 $SCRIPT_PATH $options +# source activate base +# singularity exec --nv -B /cognitive_comp/:/cognitive_comp/ $SINGULARITY_PATH /home/ganruyi/anaconda3/bin/python $SCRIPT_PATH $options +# /home/ganruyi/anaconda3/bin/python $SCRIPT_PATH $options + diff --git a/fengshen/examples/zen2_finetune/ner_zen2_base_cluener.sh b/fengshen/examples/zen2_finetune/ner_zen2_base_cluener.sh new file mode 100644 index 0000000000000000000000000000000000000000..04b97b5fe5123af3170523dfde0ae008a78b2428 --- /dev/null +++ b/fengshen/examples/zen2_finetune/ner_zen2_base_cluener.sh @@ -0,0 +1,91 @@ +#!/bin/bash +#SBATCH --job-name=zen2_base_cluener # create a short name for your job +#SBATCH --nodes=1 # node count +#SBATCH --ntasks=1 # total number of tasks across all nodes +#SBATCH --cpus-per-task=30 # cpu-cores per task (>1 if multi-threaded tasks) +#SBATCH --gres=gpu:1 # number of gpus per node +#SBATCH --mail-type=ALL # send email when job begins, ends or failed etc. +#SBATCH -o /cognitive_comp/ganruyi/experiments/ner_finetune/zen2_base_cluener/%x-%j.log # output and error file name (%x=job name, %j=job id) + + +# export CUDA_VISIBLE_DEVICES='2' +export TORCH_EXTENSIONS_DIR=/cognitive_comp/ganruyi/tmp/torch_extendsions + +MODEL_NAME=zen2_base + +TASK=cluener + +ZERO_STAGE=1 +STRATEGY=deepspeed_stage_${ZERO_STAGE} + +ROOT_DIR=/cognitive_comp/ganruyi/experiments/ner_finetune/${MODEL_NAME}_${TASK} +if [ ! -d ${ROOT_DIR} ];then + mkdir -p ${ROOT_DIR} + echo ${ROOT_DIR} created!!!!!!!!!!!!!! +else + echo ${ROOT_DIR} exist!!!!!!!!!!!!!!! 
+fi + +DATA_DIR=/cognitive_comp/lujunyu/data_zh/NER_Aligned/CLUENER/ +PRETRAINED_MODEL_PATH=/cognitive_comp/ganruyi/hf_models/zen/zh_zen_base_2.0 + +CHECKPOINT_PATH=${ROOT_DIR}/ckpt/ +OUTPUT_PATH=${ROOT_DIR}/predict.json + +DATA_ARGS="\ + --data_dir $DATA_DIR \ + --train_data train.char.txt \ + --valid_data dev.char.txt \ + --test_data dev.char.txt \ + --train_batchsize 32 \ + --valid_batchsize 16 \ + --max_seq_length 256 \ + --task_name cluener \ + " + +MODEL_ARGS="\ + --learning_rate 3e-5 \ + --weight_decay 0.1 \ + --warmup_ratio 0.01 \ + --markup bio \ + --middle_prefix I- \ + " + +MODEL_CHECKPOINT_ARGS="\ + --monitor val_f1 \ + --save_top_k 3 \ + --mode max \ + --every_n_train_steps 100 \ + --save_weights_only True \ + --dirpath $CHECKPOINT_PATH \ + --filename model-{epoch:02d}-{val_f1:.4f} \ + " + +TRAINER_ARGS="\ + --max_epochs 30 \ + --gpus 1 \ + --check_val_every_n_epoch 1 \ + --val_check_interval 100 \ + --default_root_dir $ROOT_DIR \ + " + + +options=" \ + --pretrained_model_path $PRETRAINED_MODEL_PATH \ + --vocab_file $PRETRAINED_MODEL_PATH/vocab.txt \ + --do_lower_case \ + --output_save_path $OUTPUT_PATH \ + $DATA_ARGS \ + $MODEL_ARGS \ + $MODEL_CHECKPOINT_ARGS \ + $TRAINER_ARGS \ +" +SCRIPT_PATH=/cognitive_comp/ganruyi/Fengshenbang-LM/fengshen/examples/zen2_finetune/fengshen_token_level_ft_task.py +/home/ganruyi/anaconda3/bin/python $SCRIPT_PATH $options + +# SINGULARITY_PATH=/cognitive_comp/ganruyi/pytorch21_06_py3_docker_image_v2.sif +# python3 $SCRIPT_PATH $options +# source activate base +# singularity exec --nv -B /cognitive_comp/:/cognitive_comp/ $SINGULARITY_PATH /home/ganruyi/anaconda3/bin/python $SCRIPT_PATH $options +# /home/ganruyi/anaconda3/bin/python $SCRIPT_PATH $options + diff --git a/fengshen/examples/zen2_finetune/ner_zen2_base_cmeee.sh b/fengshen/examples/zen2_finetune/ner_zen2_base_cmeee.sh new file mode 100644 index 0000000000000000000000000000000000000000..a4be7221a250030db4cf1b7d157f1d6c0fd4b0f0 --- /dev/null +++ b/fengshen/examples/zen2_finetune/ner_zen2_base_cmeee.sh @@ -0,0 +1,92 @@ +#!/bin/bash +#SBATCH --job-name=zen2_base_cmeee # create a short name for your job +#SBATCH --nodes=1 # node count +#SBATCH --ntasks=2 # total number of tasks across all nodes +#SBATCH --cpus-per-task=30 # cpu-cores per task (>1 if multi-threaded tasks) +#SBATCH --gres=gpu:2 # number of gpus per node +#SBATCH --mail-type=ALL # send email when job begins, ends or failed etc. +#SBATCH -o /cognitive_comp/lujunyu/experiments/ner_finetune/zen2_base_cmeee/%x-%j.log # output and error file name (%x=job name, %j=job id) +#SBATCH -p hgx + + +# export CUDA_VISIBLE_DEVICES='2' +export TORCH_EXTENSIONS_DIR=/cognitive_comp/lujunyu/tmp/torch_extendsions + +MODEL_NAME=zen2_base + +TASK=cmeee + +ZERO_STAGE=1 +STRATEGY=deepspeed_stage_${ZERO_STAGE} + +ROOT_DIR=/cognitive_comp/lujunyu/experiments/ner_finetune/${MODEL_NAME}_${TASK} +if [ ! -d ${ROOT_DIR} ];then + mkdir -p ${ROOT_DIR} + echo ${ROOT_DIR} created!!!!!!!!!!!!!! +else + echo ${ROOT_DIR} exist!!!!!!!!!!!!!!! 
+fi + +DATA_DIR=/cognitive_comp/lujunyu/data_zh/NER_Aligned/CMeEE_copy/ +PRETRAINED_MODEL_PATH=/cognitive_comp/lujunyu/pretrain_models/zen2-base-med + +CHECKPOINT_PATH=${ROOT_DIR}/ckpt/ +OUTPUT_PATH=${ROOT_DIR}/predict.json + +DATA_ARGS="\ + --data_dir $DATA_DIR \ + --train_data train.char.bio \ + --valid_data dev.char.bio \ + --test_data dev.char.bio \ + --train_batchsize 16 \ + --valid_batchsize 16 \ + --max_seq_length 512 \ + --task_name cmeee \ + " + +MODEL_ARGS="\ + --learning_rate 3e-5 \ + --weight_decay 0.1 \ + --warmup_ratio 0.01 \ + --markup bio \ + --middle_prefix I- \ + " + +MODEL_CHECKPOINT_ARGS="\ + --monitor val_f1 \ + --save_top_k 3 \ + --mode max \ + --every_n_train_steps 100 \ + --save_weights_only True \ + --dirpath $CHECKPOINT_PATH \ + --filename model-{epoch:02d}-{val_f1:.4f} \ + " + +TRAINER_ARGS="\ + --max_epochs 10 \ + --gpus 2 \ + --check_val_every_n_epoch 1 \ + --val_check_interval 0.25 \ + --default_root_dir $ROOT_DIR \ + " + + +options=" \ + --pretrained_model_path $PRETRAINED_MODEL_PATH \ + --vocab_file $PRETRAINED_MODEL_PATH/vocab.txt \ + --do_lower_case \ + --output_save_path $OUTPUT_PATH \ + $DATA_ARGS \ + $MODEL_ARGS \ + $MODEL_CHECKPOINT_ARGS \ + $TRAINER_ARGS \ +" +SCRIPT_PATH=/cognitive_comp/lujunyu/Fengshenbang-LM-Git/fengshen/examples/zen2_finetune/fengshen_token_level_ft_task.py +srun python $SCRIPT_PATH $options + +# SINGULARITY_PATH=/cognitive_comp/ganruyi/pytorch21_06_py3_docker_image_v2.sif +# python3 $SCRIPT_PATH $options +# source activate base +# singularity exec --nv -B /cognitive_comp/:/cognitive_comp/ $SINGULARITY_PATH /home/ganruyi/anaconda3/bin/python $SCRIPT_PATH $options +# /home/ganruyi/anaconda3/bin/python $SCRIPT_PATH $options + diff --git a/fengshen/examples/zen2_finetune/ner_zen2_base_msra.sh b/fengshen/examples/zen2_finetune/ner_zen2_base_msra.sh new file mode 100644 index 0000000000000000000000000000000000000000..397c3ea6adc3d9f275389509aa41d0e4050b3c14 --- /dev/null +++ b/fengshen/examples/zen2_finetune/ner_zen2_base_msra.sh @@ -0,0 +1,91 @@ +#!/bin/bash +#SBATCH --job-name=zen2_base_msra # create a short name for your job +#SBATCH --nodes=1 # node count +#SBATCH --ntasks=1 # total number of tasks across all nodes +#SBATCH --cpus-per-task=30 # cpu-cores per task (>1 if multi-threaded tasks) +#SBATCH --gres=gpu:1 # number of gpus per node +#SBATCH --mail-type=ALL # send email when job begins, ends or failed etc. +#SBATCH -o /cognitive_comp/ganruyi/experiments/ner_finetune/zen2_base_msra/%x-%j.log # output and error file name (%x=job name, %j=job id) + + +# export CUDA_VISIBLE_DEVICES='2' +export TORCH_EXTENSIONS_DIR=/cognitive_comp/ganruyi/tmp/torch_extendsions + +MODEL_NAME=zen2_base + +TASK=msra + +ZERO_STAGE=1 +STRATEGY=deepspeed_stage_${ZERO_STAGE} + +ROOT_DIR=/cognitive_comp/ganruyi/experiments/ner_finetune/${MODEL_NAME}_${TASK} +if [ ! -d ${ROOT_DIR} ];then + mkdir -p ${ROOT_DIR} + echo ${ROOT_DIR} created!!!!!!!!!!!!!! +else + echo ${ROOT_DIR} exist!!!!!!!!!!!!!!! 
+fi + +DATA_DIR=/cognitive_comp/lujunyu/data_zh/NER_Aligned/MSRA/ +PRETRAINED_MODEL_PATH=/cognitive_comp/ganruyi/hf_models/zen/zh_zen_base_2.0 + +CHECKPOINT_PATH=${ROOT_DIR}/ckpt/ +OUTPUT_PATH=${ROOT_DIR}/predict.json + +DATA_ARGS="\ + --data_dir $DATA_DIR \ + --train_data train_dev.char.bmes \ + --valid_data test.char.bmes \ + --test_data test.char.bmes \ + --train_batchsize 32 \ + --valid_batchsize 16 \ + --max_seq_length 256 \ + --task_name msra \ + " + +MODEL_ARGS="\ + --learning_rate 3e-5 \ + --weight_decay 0.1 \ + --warmup_ratio 0.01 \ + --markup bioes \ + --middle_prefix M- \ + " + +MODEL_CHECKPOINT_ARGS="\ + --monitor val_f1 \ + --save_top_k 3 \ + --mode max \ + --every_n_train_steps 800 \ + --save_weights_only True \ + --dirpath $CHECKPOINT_PATH \ + --filename model-{epoch:02d}-{val_f1:.4f} \ + " + +TRAINER_ARGS="\ + --max_epochs 30 \ + --gpus 1 \ + --check_val_every_n_epoch 1 \ + --val_check_interval 800 \ + --default_root_dir $ROOT_DIR \ + " + + +options=" \ + --pretrained_model_path $PRETRAINED_MODEL_PATH \ + --vocab_file $PRETRAINED_MODEL_PATH/vocab.txt \ + --do_lower_case \ + --output_save_path $OUTPUT_PATH \ + $DATA_ARGS \ + $MODEL_ARGS \ + $MODEL_CHECKPOINT_ARGS \ + $TRAINER_ARGS \ +" +SCRIPT_PATH=/cognitive_comp/ganruyi/Fengshenbang-LM/fengshen/examples/zen2_finetune/fengshen_token_level_ft_task.py +/home/ganruyi/anaconda3/bin/python $SCRIPT_PATH $options + +# SINGULARITY_PATH=/cognitive_comp/ganruyi/pytorch21_06_py3_docker_image_v2.sif +# python3 $SCRIPT_PATH $options +# source activate base +# singularity exec --nv -B /cognitive_comp/:/cognitive_comp/ $SINGULARITY_PATH /home/ganruyi/anaconda3/bin/python $SCRIPT_PATH $options +# /home/ganruyi/anaconda3/bin/python $SCRIPT_PATH $options + diff --git a/fengshen/examples/zen2_finetune/ner_zen2_base_ontonotes4.sh b/fengshen/examples/zen2_finetune/ner_zen2_base_ontonotes4.sh new file mode 100644 index 0000000000000000000000000000000000000000..1e1237967712a6862e5770e90d4e8db8d074d320 --- /dev/null +++ b/fengshen/examples/zen2_finetune/ner_zen2_base_ontonotes4.sh @@ -0,0 +1,92 @@ +#!/bin/bash +#SBATCH --job-name=zen2_base_ontonotes4 # create a short name for your job +#SBATCH --nodes=1 # node count +#SBATCH --ntasks=1 # total number of tasks across all nodes +#SBATCH --cpus-per-task=30 # cpu-cores per task (>1 if multi-threaded tasks) +#SBATCH --gres=gpu:1 # number of gpus per node +#SBATCH --mail-type=ALL # send email when job begins, ends or failed etc. +#SBATCH -o /cognitive_comp/ganruyi/experiments/ner_finetune/zen2_base_ontonotes4/%x-%j.log # output and error file name (%x=job name, %j=job id) + + +# export CUDA_VISIBLE_DEVICES='2' +export TORCH_EXTENSIONS_DIR=/cognitive_comp/ganruyi/tmp/torch_extendsions + +MODEL_NAME=zen2_base + +TASK=ontonotes4 + +ZERO_STAGE=1 +STRATEGY=deepspeed_stage_${ZERO_STAGE} + +ROOT_DIR=/cognitive_comp/ganruyi/experiments/ner_finetune/${MODEL_NAME}_${TASK} +if [ ! -d ${ROOT_DIR} ];then + mkdir -p ${ROOT_DIR} + echo ${ROOT_DIR} created!!!!!!!!!!!!!! +else + echo ${ROOT_DIR} exist!!!!!!!!!!!!!!! 
+fi
+
+DATA_DIR=/cognitive_comp/lujunyu/data_zh/NER_Aligned/OntoNotes4/
+# PRETRAINED_MODEL_PATH=/cognitive_comp/ganruyi/hf_models/zen/zh_zen_base_2.0
+PRETRAINED_MODEL_PATH=IDEA-CCNL/Erlangshen-ZEN2-345M-Chinese
+
+CHECKPOINT_PATH=${ROOT_DIR}/ckpt/
+OUTPUT_PATH=${ROOT_DIR}/predict.json
+
+DATA_ARGS="\
+ --data_dir $DATA_DIR \
+ --train_data train.char.bmes \
+ --valid_data test.char.bmes \
+ --test_data test.char.bmes \
+ --train_batchsize 32 \
+ --valid_batchsize 16 \
+ --max_seq_length 256 \
+ --task_name ontonotes4 \
+ "
+
+MODEL_ARGS="\
+ --learning_rate 3e-5 \
+ --weight_decay 0.1 \
+ --warmup_ratio 0.01 \
+ --markup bioes \
+ --middle_prefix M- \
+ "
+
+MODEL_CHECKPOINT_ARGS="\
+ --monitor val_f1 \
+ --save_top_k 3 \
+ --mode max \
+ --every_n_train_steps 200 \
+ --save_weights_only True \
+ --dirpath $CHECKPOINT_PATH \
+ --filename model-{epoch:02d}-{val_f1:.4f} \
+ "
+
+TRAINER_ARGS="\
+ --max_epochs 30 \
+ --gpus 1 \
+ --check_val_every_n_epoch 1 \
+ --val_check_interval 200 \
+ --default_root_dir $ROOT_DIR \
+ "
+
+
+options=" \
+ --pretrained_model_path $PRETRAINED_MODEL_PATH \
+ --vocab_file $PRETRAINED_MODEL_PATH/vocab.txt \
+ --do_lower_case \
+ --output_save_path $OUTPUT_PATH \
+ $DATA_ARGS \
+ $MODEL_ARGS \
+ $MODEL_CHECKPOINT_ARGS \
+ $TRAINER_ARGS \
+"
+SCRIPT_PATH=/cognitive_comp/ganruyi/Fengshenbang-LM/fengshen/examples/zen2_finetune/fengshen_token_level_ft_task.py
+/home/ganruyi/anaconda3/bin/python $SCRIPT_PATH $options
+
+# SINGULARITY_PATH=/cognitive_comp/ganruyi/pytorch21_06_py3_docker_image_v2.sif
+# python3 $SCRIPT_PATH $options
+# source activate base
+# singularity exec --nv -B /cognitive_comp/:/cognitive_comp/ $SINGULARITY_PATH /home/ganruyi/anaconda3/bin/python $SCRIPT_PATH $options
+# /home/ganruyi/anaconda3/bin/python $SCRIPT_PATH $options
+
diff --git a/fengshen/examples/zen2_finetune/ner_zen2_base_resume.sh b/fengshen/examples/zen2_finetune/ner_zen2_base_resume.sh
new file mode 100644
index 0000000000000000000000000000000000000000..a7aee577ed035c0f39b883aa8a2a4dd6fffd479d
--- /dev/null
+++ b/fengshen/examples/zen2_finetune/ner_zen2_base_resume.sh
@@ -0,0 +1,91 @@
+#!/bin/bash
+#SBATCH --job-name=zen2_base_resume # create a short name for your job
+#SBATCH --nodes=1 # node count
+#SBATCH --ntasks=1 # total number of tasks across all nodes
+#SBATCH --cpus-per-task=30 # cpu-cores per task (>1 if multi-threaded tasks)
+#SBATCH --gres=gpu:1 # number of gpus per node
+#SBATCH --mail-type=ALL # send email when job begins, ends or failed etc.
+#SBATCH -o /cognitive_comp/ganruyi/experiments/ner_finetune/zen2_base_resume/%x-%j.log # output and error file name (%x=job name, %j=job id)
+
+
+# export CUDA_VISIBLE_DEVICES='2'
+export TORCH_EXTENSIONS_DIR=/cognitive_comp/ganruyi/tmp/torch_extendsions
+
+MODEL_NAME=zen2_base
+
+TASK=resume
+
+ZERO_STAGE=1
+STRATEGY=deepspeed_stage_${ZERO_STAGE}
+
+ROOT_DIR=/cognitive_comp/ganruyi/experiments/ner_finetune/${MODEL_NAME}_${TASK}
+if [ ! -d ${ROOT_DIR} ];then
+ mkdir -p ${ROOT_DIR}
+ echo ${ROOT_DIR} created!!!!!!!!!!!!!!
+else
+ echo ${ROOT_DIR} exist!!!!!!!!!!!!!!!
+fi + +DATA_DIR=/cognitive_comp/lujunyu/data_zh/NER_Aligned/Resume/ +PRETRAINED_MODEL_PATH=/cognitive_comp/ganruyi/hf_models/zen/zh_zen_base_2.0 + +CHECKPOINT_PATH=${ROOT_DIR}/ckpt/ +OUTPUT_PATH=${ROOT_DIR}/predict.json + +DATA_ARGS="\ + --data_dir $DATA_DIR \ + --train_data train.char.bmes \ + --valid_data test.char.bmes \ + --test_data test.char.bmes \ + --train_batchsize 32 \ + --valid_batchsize 16 \ + --max_seq_length 256 \ + --task_name resume \ + " + +MODEL_ARGS="\ + --learning_rate 3e-5 \ + --weight_decay 0.1 \ + --warmup_ratio 0.01 \ + --markup bioes \ + --middle_prefix M- \ + " + +MODEL_CHECKPOINT_ARGS="\ + --monitor val_f1 \ + --save_top_k 3 \ + --mode max \ + --every_n_train_steps 100 \ + --save_weights_only True \ + --dirpath $CHECKPOINT_PATH \ + --filename model-{epoch:02d}-{val_f1:.4f} \ + " + +TRAINER_ARGS="\ + --max_epochs 30 \ + --gpus 1 \ + --check_val_every_n_epoch 1 \ + --val_check_interval 100 \ + --default_root_dir $ROOT_DIR \ + " + + +options=" \ + --pretrained_model_path $PRETRAINED_MODEL_PATH \ + --vocab_file $PRETRAINED_MODEL_PATH/vocab.txt \ + --do_lower_case \ + --output_save_path $OUTPUT_PATH \ + $DATA_ARGS \ + $MODEL_ARGS \ + $MODEL_CHECKPOINT_ARGS \ + $TRAINER_ARGS \ +" +SCRIPT_PATH=/cognitive_comp/ganruyi/Fengshenbang-LM/fengshen/examples/zen2_finetune/fengshen_token_level_ft_task.py +/home/ganruyi/anaconda3/bin/python $SCRIPT_PATH $options + +# SINGULARITY_PATH=/cognitive_comp/ganruyi/pytorch21_06_py3_docker_image_v2.sif +# python3 $SCRIPT_PATH $options +# source activate base +# singularity exec --nv -B /cognitive_comp/:/cognitive_comp/ $SINGULARITY_PATH /home/ganruyi/anaconda3/bin/python $SCRIPT_PATH $options +# /home/ganruyi/anaconda3/bin/python $SCRIPT_PATH $options + diff --git a/fengshen/examples/zen2_finetune/ner_zen2_base_weibo.sh b/fengshen/examples/zen2_finetune/ner_zen2_base_weibo.sh new file mode 100644 index 0000000000000000000000000000000000000000..b3f4667e59fe0b7ba98f37dec65e12fdf6faf555 --- /dev/null +++ b/fengshen/examples/zen2_finetune/ner_zen2_base_weibo.sh @@ -0,0 +1,91 @@ +#!/bin/bash +#SBATCH --job-name=zen2_base_weibo # create a short name for your job +#SBATCH --nodes=1 # node count +#SBATCH --ntasks=1 # total number of tasks across all nodes +#SBATCH --cpus-per-task=30 # cpu-cores per task (>1 if multi-threaded tasks) +#SBATCH --gres=gpu:1 # number of gpus per node +#SBATCH --mail-type=ALL # send email when job begins, ends or failed etc. +#SBATCH -o /cognitive_comp/ganruyi/experiments/ner_finetune/zen2_base_weibo/%x-%j.log # output and error file name (%x=job name, %j=job id) + + +# export CUDA_VISIBLE_DEVICES='2' +export TORCH_EXTENSIONS_DIR=/cognitive_comp/ganruyi/tmp/torch_extendsions + +MODEL_NAME=zen2_base + +TASK=weibo + +ZERO_STAGE=1 +STRATEGY=deepspeed_stage_${ZERO_STAGE} + +ROOT_DIR=/cognitive_comp/ganruyi/experiments/ner_finetune/${MODEL_NAME}_${TASK} +if [ ! -d ${ROOT_DIR} ];then + mkdir -p ${ROOT_DIR} + echo ${ROOT_DIR} created!!!!!!!!!!!!!! +else + echo ${ROOT_DIR} exist!!!!!!!!!!!!!!! 
+fi + +DATA_DIR=/cognitive_comp/lujunyu/data_zh/NER_Aligned/weibo/ +PRETRAINED_MODEL_PATH=/cognitive_comp/ganruyi/hf_models/zen/zh_zen_base_2.0 + +CHECKPOINT_PATH=${ROOT_DIR}/ckpt/ +OUTPUT_PATH=${ROOT_DIR}/predict.json + +DATA_ARGS="\ + --data_dir $DATA_DIR \ + --train_data train.all.bmes \ + --valid_data test.all.bmes \ + --test_data test.all.bmes \ + --train_batchsize 32 \ + --valid_batchsize 16 \ + --max_seq_length 256 \ + --task_name weibo \ + " + +MODEL_ARGS="\ + --learning_rate 3e-5 \ + --weight_decay 0.1 \ + --warmup_ratio 0.01 \ + --markup bioes \ + --middle_prefix M- \ + " + +MODEL_CHECKPOINT_ARGS="\ + --monitor val_f1 \ + --save_top_k 3 \ + --mode max \ + --every_n_train_steps 100 \ + --save_weights_only True \ + --dirpath $CHECKPOINT_PATH \ + --filename model-{epoch:02d}-{val_f1:.4f} \ + " + +TRAINER_ARGS="\ + --max_epochs 30 \ + --gpus 1 \ + --check_val_every_n_epoch 1 \ + --val_check_interval 20 \ + --default_root_dir $ROOT_DIR \ + " + + +options=" \ + --pretrained_model_path $PRETRAINED_MODEL_PATH \ + --vocab_file $PRETRAINED_MODEL_PATH/vocab.txt \ + --do_lower_case \ + --output_save_path $OUTPUT_PATH \ + $DATA_ARGS \ + $MODEL_ARGS \ + $MODEL_CHECKPOINT_ARGS \ + $TRAINER_ARGS \ +" +SCRIPT_PATH=/cognitive_comp/ganruyi/Fengshenbang-LM/fengshen/examples/zen2_finetune/fengshen_token_level_ft_task.py +/home/ganruyi/anaconda3/bin/python $SCRIPT_PATH $options + +# SINGULARITY_PATH=/cognitive_comp/ganruyi/pytorch21_06_py3_docker_image_v2.sif +# python3 $SCRIPT_PATH $options +# source activate base +# singularity exec --nv -B /cognitive_comp/:/cognitive_comp/ $SINGULARITY_PATH /home/ganruyi/anaconda3/bin/python $SCRIPT_PATH $options +# /home/ganruyi/anaconda3/bin/python $SCRIPT_PATH $options + diff --git a/fengshen/examples/zen2_finetune/ner_zen2_large_cluener.sh b/fengshen/examples/zen2_finetune/ner_zen2_large_cluener.sh new file mode 100644 index 0000000000000000000000000000000000000000..07193e3f15ca69755853623a57fee0a573db6593 --- /dev/null +++ b/fengshen/examples/zen2_finetune/ner_zen2_large_cluener.sh @@ -0,0 +1,91 @@ +#!/bin/bash +#SBATCH --job-name=zen2_large_cluener # create a short name for your job +#SBATCH --nodes=1 # node count +#SBATCH --ntasks=1 # total number of tasks across all nodes +#SBATCH --cpus-per-task=30 # cpu-cores per task (>1 if multi-threaded tasks) +#SBATCH --gres=gpu:1 # number of gpus per node +#SBATCH --mail-type=ALL # send email when job begins, ends or failed etc. +#SBATCH -o /cognitive_comp/ganruyi/experiments/ner_finetune/zen2_large_cluener/%x-%j.log # output and error file name (%x=job name, %j=job id) + + +# export CUDA_VISIBLE_DEVICES='2' +export TORCH_EXTENSIONS_DIR=/cognitive_comp/ganruyi/tmp/torch_extendsions + +MODEL_NAME=zen2_large + +TASK=cluener + +ZERO_STAGE=1 +STRATEGY=deepspeed_stage_${ZERO_STAGE} + +ROOT_DIR=/cognitive_comp/ganruyi/experiments/ner_finetune/${MODEL_NAME}_${TASK} +if [ ! -d ${ROOT_DIR} ];then + mkdir -p ${ROOT_DIR} + echo ${ROOT_DIR} created!!!!!!!!!!!!!! +else + echo ${ROOT_DIR} exist!!!!!!!!!!!!!!! 
+fi + +DATA_DIR=/cognitive_comp/lujunyu/data_zh/NER_Aligned/CLUENER/ +PRETRAINED_MODEL_PATH=/cognitive_comp/ganruyi/hf_models/zen/zh_zen_large_2.0 + +CHECKPOINT_PATH=${ROOT_DIR}/ckpt/ +OUTPUT_PATH=${ROOT_DIR}/predict.json + +DATA_ARGS="\ + --data_dir $DATA_DIR \ + --train_data train.char.txt \ + --valid_data dev.char.txt \ + --test_data dev.char.txt \ + --train_batchsize 16 \ + --valid_batchsize 16 \ + --max_seq_length 256 \ + --task_name cluener \ + " + +MODEL_ARGS="\ + --learning_rate 3e-5 \ + --weight_decay 0.1 \ + --warmup_ratio 0.01 \ + --markup bio \ + --middle_prefix I- \ + " + +MODEL_CHECKPOINT_ARGS="\ + --monitor val_f1 \ + --save_top_k 3 \ + --mode max \ + --every_n_train_steps 100 \ + --save_weights_only True \ + --dirpath $CHECKPOINT_PATH \ + --filename model-{epoch:02d}-{val_f1:.4f} \ + " + +TRAINER_ARGS="\ + --max_epochs 30 \ + --gpus 1 \ + --check_val_every_n_epoch 1 \ + --val_check_interval 200 \ + --default_root_dir $ROOT_DIR \ + " + + +options=" \ + --pretrained_model_path $PRETRAINED_MODEL_PATH \ + --vocab_file $PRETRAINED_MODEL_PATH/vocab.txt \ + --do_lower_case \ + --output_save_path $OUTPUT_PATH \ + $DATA_ARGS \ + $MODEL_ARGS \ + $MODEL_CHECKPOINT_ARGS \ + $TRAINER_ARGS \ +" +SCRIPT_PATH=/cognitive_comp/ganruyi/Fengshenbang-LM/fengshen/examples/zen2_finetune/fengshen_token_level_ft_task.py +/home/ganruyi/anaconda3/bin/python $SCRIPT_PATH $options + +# SINGULARITY_PATH=/cognitive_comp/ganruyi/pytorch21_06_py3_docker_image_v2.sif +# python3 $SCRIPT_PATH $options +# source activate base +# singularity exec --nv -B /cognitive_comp/:/cognitive_comp/ $SINGULARITY_PATH /home/ganruyi/anaconda3/bin/python $SCRIPT_PATH $options +# /home/ganruyi/anaconda3/bin/python $SCRIPT_PATH $options + diff --git a/fengshen/examples/zen2_finetune/ner_zen2_large_cmeee.sh b/fengshen/examples/zen2_finetune/ner_zen2_large_cmeee.sh new file mode 100644 index 0000000000000000000000000000000000000000..02409b04501bf6155481673b3acd0bd22914d3f3 --- /dev/null +++ b/fengshen/examples/zen2_finetune/ner_zen2_large_cmeee.sh @@ -0,0 +1,91 @@ +#!/bin/bash +#SBATCH --job-name=zen2_large_cmeee # create a short name for your job +#SBATCH --nodes=1 # node count +#SBATCH --ntasks=1 # total number of tasks across all nodes +#SBATCH --cpus-per-task=30 # cpu-cores per task (>1 if multi-threaded tasks) +#SBATCH --gres=gpu:1 # number of gpus per node +#SBATCH --mail-type=ALL # send email when job begins, ends or failed etc. +#SBATCH -o /cognitive_comp/ganruyi/experiments/ner_finetune/zen2_large_cmeee/%x-%j.log # output and error file name (%x=job name, %j=job id) + + +# export CUDA_VISIBLE_DEVICES='2' +export TORCH_EXTENSIONS_DIR=/cognitive_comp/ganruyi/tmp/torch_extendsions + +MODEL_NAME=zen2_large + +TASK=cmeee + +ZERO_STAGE=1 +STRATEGY=deepspeed_stage_${ZERO_STAGE} + +ROOT_DIR=/cognitive_comp/ganruyi/experiments/ner_finetune/${MODEL_NAME}_${TASK} +if [ ! -d ${ROOT_DIR} ];then + mkdir -p ${ROOT_DIR} + echo ${ROOT_DIR} created!!!!!!!!!!!!!! +else + echo ${ROOT_DIR} exist!!!!!!!!!!!!!!! 
+fi + +DATA_DIR=/cognitive_comp/lujunyu/data_zh/NER_Aligned/CMeEE/ +PRETRAINED_MODEL_PATH=/cognitive_comp/ganruyi/hf_models/zen/zh_zen_large_2.0 + +CHECKPOINT_PATH=${ROOT_DIR}/ckpt/ +OUTPUT_PATH=${ROOT_DIR}/predict.json + +DATA_ARGS="\ + --data_dir $DATA_DIR \ + --train_data train.char.bio \ + --valid_data dev.char.bio \ + --test_data dev.char.bio \ + --train_batchsize 16 \ + --valid_batchsize 16 \ + --max_seq_length 256 \ + --task_name cmeee \ + " + +MODEL_ARGS="\ + --learning_rate 3e-5 \ + --weight_decay 0.1 \ + --warmup_ratio 0.01 \ + --markup bio \ + --middle_prefix I- \ + " + +MODEL_CHECKPOINT_ARGS="\ + --monitor val_f1 \ + --save_top_k 3 \ + --mode max \ + --every_n_train_steps 100 \ + --save_weights_only True \ + --dirpath $CHECKPOINT_PATH \ + --filename model-{epoch:02d}-{val_f1:.4f} \ + " + +TRAINER_ARGS="\ + --max_epochs 30 \ + --gpus 1 \ + --check_val_every_n_epoch 1 \ + --val_check_interval 200 \ + --default_root_dir $ROOT_DIR \ + " + + +options=" \ + --pretrained_model_path $PRETRAINED_MODEL_PATH \ + --vocab_file $PRETRAINED_MODEL_PATH/vocab.txt \ + --do_lower_case \ + --output_save_path $OUTPUT_PATH \ + $DATA_ARGS \ + $MODEL_ARGS \ + $MODEL_CHECKPOINT_ARGS \ + $TRAINER_ARGS \ +" +SCRIPT_PATH=/cognitive_comp/ganruyi/Fengshenbang-LM/fengshen/examples/zen2_finetune/fengshen_token_level_ft_task.py +/home/ganruyi/anaconda3/bin/python $SCRIPT_PATH $options + +# SINGULARITY_PATH=/cognitive_comp/ganruyi/pytorch21_06_py3_docker_image_v2.sif +# python3 $SCRIPT_PATH $options +# source activate base +# singularity exec --nv -B /cognitive_comp/:/cognitive_comp/ $SINGULARITY_PATH /home/ganruyi/anaconda3/bin/python $SCRIPT_PATH $options +# /home/ganruyi/anaconda3/bin/python $SCRIPT_PATH $options + diff --git a/fengshen/examples/zen2_finetune/ner_zen2_large_msra.sh b/fengshen/examples/zen2_finetune/ner_zen2_large_msra.sh new file mode 100644 index 0000000000000000000000000000000000000000..cef8f1f70babc94ed77dc585fbba47f5b45ff7a5 --- /dev/null +++ b/fengshen/examples/zen2_finetune/ner_zen2_large_msra.sh @@ -0,0 +1,91 @@ +#!/bin/bash +#SBATCH --job-name=zen2_large_msra # create a short name for your job +#SBATCH --nodes=1 # node count +#SBATCH --ntasks=1 # total number of tasks across all nodes +#SBATCH --cpus-per-task=30 # cpu-cores per task (>1 if multi-threaded tasks) +#SBATCH --gres=gpu:1 # number of gpus per node +#SBATCH --mail-type=ALL # send email when job begins, ends or failed etc. +#SBATCH -o /cognitive_comp/ganruyi/experiments/ner_finetune/zen2_large_msra/%x-%j.log # output and error file name (%x=job name, %j=job id) + + +# export CUDA_VISIBLE_DEVICES='2' +export TORCH_EXTENSIONS_DIR=/cognitive_comp/ganruyi/tmp/torch_extendsions + +MODEL_NAME=zen2_large + +TASK=msra + +ZERO_STAGE=1 +STRATEGY=deepspeed_stage_${ZERO_STAGE} + +ROOT_DIR=/cognitive_comp/ganruyi/experiments/ner_finetune/${MODEL_NAME}_${TASK} +if [ ! -d ${ROOT_DIR} ];then + mkdir -p ${ROOT_DIR} + echo ${ROOT_DIR} created!!!!!!!!!!!!!! +else + echo ${ROOT_DIR} exist!!!!!!!!!!!!!!! 
+fi + +DATA_DIR=/cognitive_comp/lujunyu/data_zh/NER_Aligned/MSRA/ +PRETRAINED_MODEL_PATH=/cognitive_comp/ganruyi/hf_models/zen/zh_zen_large_2.0 + +CHECKPOINT_PATH=${ROOT_DIR}/ckpt/ +OUTPUT_PATH=${ROOT_DIR}/predict.json + +DATA_ARGS="\ + --data_dir $DATA_DIR \ + --train_data train_dev.char.bmes \ + --valid_data test.char.bmes \ + --test_data test.char.bmes \ + --train_batchsize 16 \ + --valid_batchsize 16 \ + --max_seq_length 256 \ + --task_name msra \ + " + +MODEL_ARGS="\ + --learning_rate 3e-5 \ + --weight_decay 0.1 \ + --warmup_ratio 0.01 \ + --markup bioes \ + --middle_prefix M- \ + " + +MODEL_CHECKPOINT_ARGS="\ + --monitor val_f1 \ + --save_top_k 3 \ + --mode max \ + --every_n_train_steps 800 \ + --save_weights_only True \ + --dirpath $CHECKPOINT_PATH \ + --filename model-{epoch:02d}-{val_f1:.4f} \ + " + +TRAINER_ARGS="\ + --max_epochs 30 \ + --gpus 1 \ + --check_val_every_n_epoch 1 \ + --val_check_interval 800 \ + --default_root_dir $ROOT_DIR \ + " + + +options=" \ + --pretrained_model_path $PRETRAINED_MODEL_PATH \ + --vocab_file $PRETRAINED_MODEL_PATH/vocab.txt \ + --do_lower_case \ + --output_save_path $OUTPUT_PATH \ + $DATA_ARGS \ + $MODEL_ARGS \ + $MODEL_CHECKPOINT_ARGS \ + $TRAINER_ARGS \ +" +SCRIPT_PATH=/cognitive_comp/ganruyi/Fengshenbang-LM/fengshen/examples/zen2_finetune/fengshen_token_level_ft_task.py +/home/ganruyi/anaconda3/bin/python $SCRIPT_PATH $options + +# SINGULARITY_PATH=/cognitive_comp/ganruyi/pytorch21_06_py3_docker_image_v2.sif +# python3 $SCRIPT_PATH $options +# source activate base +# singularity exec --nv -B /cognitive_comp/:/cognitive_comp/ $SINGULARITY_PATH /home/ganruyi/anaconda3/bin/python $SCRIPT_PATH $options +# /home/ganruyi/anaconda3/bin/python $SCRIPT_PATH $options + diff --git a/fengshen/examples/zen2_finetune/ner_zen2_large_ontonotes4.sh b/fengshen/examples/zen2_finetune/ner_zen2_large_ontonotes4.sh new file mode 100644 index 0000000000000000000000000000000000000000..f8bb41316b4cec4bb94fa36ac9bc39c9f3ce41f8 --- /dev/null +++ b/fengshen/examples/zen2_finetune/ner_zen2_large_ontonotes4.sh @@ -0,0 +1,91 @@ +#!/bin/bash +#SBATCH --job-name=zen2_large_ontonotes4 # create a short name for your job +#SBATCH --nodes=1 # node count +#SBATCH --ntasks=1 # total number of tasks across all nodes +#SBATCH --cpus-per-task=30 # cpu-cores per task (>1 if multi-threaded tasks) +#SBATCH --gres=gpu:1 # number of gpus per node +#SBATCH --mail-type=ALL # send email when job begins, ends or failed etc. +#SBATCH -o /cognitive_comp/ganruyi/experiments/ner_finetune/zen2_large_ontonotes4/%x-%j.log # output and error file name (%x=job name, %j=job id) + + +# export CUDA_VISIBLE_DEVICES='2' +export TORCH_EXTENSIONS_DIR=/cognitive_comp/ganruyi/tmp/torch_extendsions + +MODEL_NAME=zen2_large + +TASK=ontonotes4 + +ZERO_STAGE=1 +STRATEGY=deepspeed_stage_${ZERO_STAGE} + +ROOT_DIR=/cognitive_comp/ganruyi/experiments/ner_finetune/${MODEL_NAME}_${TASK} +if [ ! -d ${ROOT_DIR} ];then + mkdir -p ${ROOT_DIR} + echo ${ROOT_DIR} created!!!!!!!!!!!!!! +else + echo ${ROOT_DIR} exist!!!!!!!!!!!!!!! 
+fi + +DATA_DIR=/cognitive_comp/lujunyu/data_zh/NER_Aligned/OntoNotes4/ +PRETRAINED_MODEL_PATH=IDEA-CCNL/Erlangshen-ZEN2-345M-Chinese + +CHECKPOINT_PATH=${ROOT_DIR}/ckpt/ +OUTPUT_PATH=${ROOT_DIR}/predict.json + +DATA_ARGS="\ + --data_dir $DATA_DIR \ + --train_data train.char.bmes \ + --valid_data test.char.bmes \ + --test_data test.char.bmes \ + --train_batchsize 16 \ + --valid_batchsize 16 \ + --max_seq_length 256 \ + --task_name ontonotes4 \ + " + +MODEL_ARGS="\ + --learning_rate 3e-5 \ + --weight_decay 0.1 \ + --warmup_ratio 0.01 \ + --markup bioes \ + --middle_prefix M- \ + " + +MODEL_CHECKPOINT_ARGS="\ + --monitor val_f1 \ + --save_top_k 3 \ + --mode max \ + --every_n_train_steps 200 \ + --save_weights_only True \ + --dirpath $CHECKPOINT_PATH \ + --filename model-{epoch:02d}-{val_f1:.4f} \ + " + +TRAINER_ARGS="\ + --max_epochs 30 \ + --gpus 1 \ + --check_val_every_n_epoch 1 \ + --val_check_interval 200 \ + --default_root_dir $ROOT_DIR \ + " + + +options=" \ + --pretrained_model_path $PRETRAINED_MODEL_PATH \ + --vocab_file $PRETRAINED_MODEL_PATH/vocab.txt \ + --do_lower_case \ + --output_save_path $OUTPUT_PATH \ + $DATA_ARGS \ + $MODEL_ARGS \ + $MODEL_CHECKPOINT_ARGS \ + $TRAINER_ARGS \ +" +SCRIPT_PATH=/cognitive_comp/ganruyi/Fengshenbang-LM/fengshen/examples/zen2_finetune/fengshen_token_level_ft_task.py +/home/ganruyi/anaconda3/bin/python $SCRIPT_PATH $options + +# SINGULARITY_PATH=/cognitive_comp/ganruyi/pytorch21_06_py3_docker_image_v2.sif +# python3 $SCRIPT_PATH $options +# source activate base +# singularity exec --nv -B /cognitive_comp/:/cognitive_comp/ $SINGULARITY_PATH /home/ganruyi/anaconda3/bin/python $SCRIPT_PATH $options +# /home/ganruyi/anaconda3/bin/python $SCRIPT_PATH $options + diff --git a/fengshen/examples/zen2_finetune/ner_zen2_large_resume.sh b/fengshen/examples/zen2_finetune/ner_zen2_large_resume.sh new file mode 100644 index 0000000000000000000000000000000000000000..e21a61f48a96f1d831c90d3cbc3a9cbe8eb7de38 --- /dev/null +++ b/fengshen/examples/zen2_finetune/ner_zen2_large_resume.sh @@ -0,0 +1,91 @@ +#!/bin/bash +#SBATCH --job-name=zen2_large_resume # create a short name for your job +#SBATCH --nodes=1 # node count +#SBATCH --ntasks=1 # total number of tasks across all nodes +#SBATCH --cpus-per-task=30 # cpu-cores per task (>1 if multi-threaded tasks) +#SBATCH --gres=gpu:1 # number of gpus per node +#SBATCH --mail-type=ALL # send email when job begins, ends or failed etc. +#SBATCH -o /cognitive_comp/ganruyi/experiments/ner_finetune/zen2_large_resume/%x-%j.log # output and error file name (%x=job name, %j=job id) + + +# export CUDA_VISIBLE_DEVICES='2' +export TORCH_EXTENSIONS_DIR=/cognitive_comp/ganruyi/tmp/torch_extendsions + +MODEL_NAME=zen2_large + +TASK=resume + +ZERO_STAGE=1 +STRATEGY=deepspeed_stage_${ZERO_STAGE} + +ROOT_DIR=/cognitive_comp/ganruyi/experiments/ner_finetune/${MODEL_NAME}_${TASK} +if [ ! -d ${ROOT_DIR} ];then + mkdir -p ${ROOT_DIR} + echo ${ROOT_DIR} created!!!!!!!!!!!!!! +else + echo ${ROOT_DIR} exist!!!!!!!!!!!!!!! 
+fi + +DATA_DIR=/cognitive_comp/lujunyu/data_zh/NER_Aligned/Resume/ +PRETRAINED_MODEL_PATH=/cognitive_comp/ganruyi/hf_models/zen/zh_zen_large_2.0 + +CHECKPOINT_PATH=${ROOT_DIR}/ckpt/ +OUTPUT_PATH=${ROOT_DIR}/predict.json + +DATA_ARGS="\ + --data_dir $DATA_DIR \ + --train_data train.char.bmes \ + --valid_data test.char.bmes \ + --test_data test.char.bmes \ + --train_batchsize 16 \ + --valid_batchsize 16 \ + --max_seq_length 256 \ + --task_name resume \ + " + +MODEL_ARGS="\ + --learning_rate 3e-5 \ + --weight_decay 0.1 \ + --warmup_ratio 0.01 \ + --markup bioes \ + --middle_prefix M- \ + " + +MODEL_CHECKPOINT_ARGS="\ + --monitor val_f1 \ + --save_top_k 3 \ + --mode max \ + --every_n_train_steps 100 \ + --save_weights_only True \ + --dirpath $CHECKPOINT_PATH \ + --filename model-{epoch:02d}-{val_f1:.4f} \ + " + +TRAINER_ARGS="\ + --max_epochs 30 \ + --gpus 1 \ + --check_val_every_n_epoch 1 \ + --val_check_interval 100 \ + --default_root_dir $ROOT_DIR \ + " + + +options=" \ + --pretrained_model_path $PRETRAINED_MODEL_PATH \ + --vocab_file $PRETRAINED_MODEL_PATH/vocab.txt \ + --do_lower_case \ + --output_save_path $OUTPUT_PATH \ + $DATA_ARGS \ + $MODEL_ARGS \ + $MODEL_CHECKPOINT_ARGS \ + $TRAINER_ARGS \ +" +SCRIPT_PATH=/cognitive_comp/ganruyi/Fengshenbang-LM/fengshen/examples/zen2_finetune/fengshen_token_level_ft_task.py +/home/ganruyi/anaconda3/bin/python $SCRIPT_PATH $options + +# SINGULARITY_PATH=/cognitive_comp/ganruyi/pytorch21_06_py3_docker_image_v2.sif +# python3 $SCRIPT_PATH $options +# source activate base +# singularity exec --nv -B /cognitive_comp/:/cognitive_comp/ $SINGULARITY_PATH /home/ganruyi/anaconda3/bin/python $SCRIPT_PATH $options +# /home/ganruyi/anaconda3/bin/python $SCRIPT_PATH $options + diff --git a/fengshen/examples/zen2_finetune/ner_zen2_large_weibo.sh b/fengshen/examples/zen2_finetune/ner_zen2_large_weibo.sh new file mode 100644 index 0000000000000000000000000000000000000000..7fab2998437ef8c12dcd93466371d0324eec4c79 --- /dev/null +++ b/fengshen/examples/zen2_finetune/ner_zen2_large_weibo.sh @@ -0,0 +1,91 @@ +#!/bin/bash +#SBATCH --job-name=zen2_large_weibo # create a short name for your job +#SBATCH --nodes=1 # node count +#SBATCH --ntasks=1 # total number of tasks across all nodes +#SBATCH --cpus-per-task=30 # cpu-cores per task (>1 if multi-threaded tasks) +#SBATCH --gres=gpu:1 # number of gpus per node +#SBATCH --mail-type=ALL # send email when job begins, ends or failed etc. +#SBATCH -o %x-%j.log # output and error file name (%x=job name, %j=job id) + + +# export CUDA_VISIBLE_DEVICES='2' +export TORCH_EXTENSIONS_DIR=/cognitive_comp/ganruyi/tmp/torch_extendsions + +MODEL_NAME=zen2_large + +TASK=weibo + +ZERO_STAGE=1 +STRATEGY=deepspeed_stage_${ZERO_STAGE} + +ROOT_DIR=/cognitive_comp/ganruyi/experiments/ner_finetune/${MODEL_NAME}_${TASK} +if [ ! -d ${ROOT_DIR} ];then + mkdir -p ${ROOT_DIR} + echo ${ROOT_DIR} created!!!!!!!!!!!!!! +else + echo ${ROOT_DIR} exist!!!!!!!!!!!!!!! 
+fi + +DATA_DIR=/cognitive_comp/lujunyu/data_zh/NER_Aligned/weibo/ +PRETRAINED_MODEL_PATH=/cognitive_comp/ganruyi/hf_models/zen/zh_zen_large_2.0 + +CHECKPOINT_PATH=${ROOT_DIR}/ckpt/ +OUTPUT_PATH=${ROOT_DIR}/predict.json + +DATA_ARGS="\ + --data_dir $DATA_DIR \ + --train_data train.all.bmes \ + --valid_data test.all.bmes \ + --test_data test.all.bmes \ + --train_batchsize 16 \ + --valid_batchsize 16 \ + --max_seq_length 256 \ + --task_name weibo \ + " + +MODEL_ARGS="\ + --learning_rate 3e-5 \ + --weight_decay 0.1 \ + --warmup_ratio 0.01 \ + --markup bioes \ + --middle_prefix M- \ + " + +MODEL_CHECKPOINT_ARGS="\ + --monitor val_f1 \ + --save_top_k 3 \ + --mode max \ + --every_n_train_steps 100 \ + --save_weights_only True \ + --dirpath $CHECKPOINT_PATH \ + --filename model-{epoch:02d}-{val_f1:.4f} \ + " + +TRAINER_ARGS="\ + --max_epochs 30 \ + --gpus 1 \ + --check_val_every_n_epoch 1 \ + --val_check_interval 20 \ + --default_root_dir $ROOT_DIR \ + " + + +options=" \ + --pretrained_model_path $PRETRAINED_MODEL_PATH \ + --vocab_file $PRETRAINED_MODEL_PATH/vocab.txt \ + --do_lower_case \ + --output_save_path $OUTPUT_PATH \ + $DATA_ARGS \ + $MODEL_ARGS \ + $MODEL_CHECKPOINT_ARGS \ + $TRAINER_ARGS \ +" +SCRIPT_PATH=/cognitive_comp/ganruyi/Fengshenbang-LM/fengshen/examples/zen2_finetune/fengshen_token_level_ft_task.py +/home/ganruyi/anaconda3/bin/python $SCRIPT_PATH $options + +# SINGULARITY_PATH=/cognitive_comp/ganruyi/pytorch21_06_py3_docker_image_v2.sif +# python3 $SCRIPT_PATH $options +# source activate base +# singularity exec --nv -B /cognitive_comp/:/cognitive_comp/ $SINGULARITY_PATH /home/ganruyi/anaconda3/bin/python $SCRIPT_PATH $options +# /home/ganruyi/anaconda3/bin/python $SCRIPT_PATH $options + diff --git a/fengshen/metric/metric.py b/fengshen/metric/metric.py new file mode 100644 index 0000000000000000000000000000000000000000..5588db3726e30fbc955e619ecd24de3c2c5a1952 --- /dev/null +++ b/fengshen/metric/metric.py @@ -0,0 +1,129 @@ +# coding=utf-8 +from collections import Counter +import torch +from torch import nn +# import seqeval + +from .utils_ner import get_entities + + +class metrics_mlm_acc(nn.Module): + def __init__(self): + super().__init__() + + def forward(self, logits, labels, masked_lm_metric): + + # if len(list(logits.shape))==3: + mask_label_size = 0 + for i in masked_lm_metric: + for j in i: + if j > 0: + mask_label_size += 1 + + y_pred = torch.argmax(logits, dim=-1) + + y_pred = y_pred.view(size=(-1,)) + y_true = labels.view(size=(-1,)) + masked_lm_metric = masked_lm_metric.view(size=(-1,)) + + corr = torch.eq(y_pred, y_true) + corr = torch.multiply(masked_lm_metric, corr) + + acc = torch.sum(corr.float())/mask_label_size + return acc + + +class EntityScore(object): + def __init__(self): + self.reset() + + def reset(self): + self.origins = [] + self.founds = [] + self.rights = [] + + def compute(self, origin, found, right): + recall = 0 if origin == 0 else (right / origin) + precision = 0 if found == 0 else (right / found) + f1 = 0. 
if recall + precision == 0 else (2 * precision * recall) / (precision + recall) + return recall, precision, f1 + + def result(self): + class_info = {} + + origin_counter = Counter([x[0] for x in self.origins]) + found_counter = Counter([x[0] for x in self.founds]) + right_counter = Counter([x[0] for x in self.rights]) + for type_, count in origin_counter.items(): + origin = count + found = found_counter.get(type_, 0) + right = right_counter.get(type_, 0) + recall, precision, f1 = self.compute(origin, found, right) + class_info[type_] = {"acc": round(precision, 4), 'recall': round(recall, 4), 'f1': round(f1, 4)} + origin = len(self.origins) + found = len(self.founds) + right = len(self.rights) + recall, precision, f1 = self.compute(origin, found, right) + return {'acc': precision, 'recall': recall, 'f1': f1}, class_info + + def update(self, true_subject, pred_subject): + self.origins.extend(true_subject) + self.founds.extend(pred_subject) + self.rights.extend([pre_entity for pre_entity in pred_subject if pre_entity in true_subject]) + +class SeqEntityScore(object): + def __init__(self, id2label, markup='bios', middle_prefix='I-'): + self.id2label = id2label + self.markup = markup + self.middle_prefix = middle_prefix + self.reset() + + def reset(self): + self.origins = [] + self.founds = [] + self.rights = [] + + def compute(self, origin, found, right): + recall = 0 if origin == 0 else (right / origin) + precision = 0 if found == 0 else (right / found) + f1 = 0. if recall + precision == 0 else (2 * precision * recall) / (precision + recall) + return recall, precision, f1 + + def result(self): + class_info = {} + origin_counter = Counter([x[0] for x in self.origins]) + found_counter = Counter([x[0] for x in self.founds]) + right_counter = Counter([x[0] for x in self.rights]) + for type_, count in origin_counter.items(): + origin = count + found = found_counter.get(type_, 0) + right = right_counter.get(type_, 0) + # print('origin:', origin, ' found:', found, ' right:', right) + recall, precision, f1 = self.compute(origin, found, right) + class_info[type_] = {"acc": round(precision, 4), 'recall': round(recall, 4), 'f1': round(f1, 4)} + origin = len(self.origins) + found = len(self.founds) + right = len(self.rights) + recall, precision, f1 = self.compute(origin, found, right) + return {'acc': precision, 'recall': recall, 'f1': f1}, class_info + + def update(self, label_paths, pred_paths): + ''' + labels_paths: [[],[],[],....] + pred_paths: [[],[],[],.....] 
+ + :param label_paths: + :param pred_paths: + :return: + Example: + >>> labels_paths = [['O', 'O', 'O', 'B-MISC', 'I-MISC', 'I-MISC', 'O'], ['B-PER', 'I-PER', 'O']] + >>> pred_paths = [['O', 'O', 'B-MISC', 'I-MISC', 'I-MISC', 'I-MISC', 'O'], ['B-PER', 'I-PER', 'O']] + ''' + for label_path, pre_path in zip(label_paths, pred_paths): + label_entities = get_entities(label_path, self.id2label, self.markup, self.middle_prefix) + pre_entities = get_entities(pre_path, self.id2label, self.markup, self.middle_prefix) + # print('label:', label_path, ',label_entities: ', label_entities) + # print('pred:', pre_path, ',pre_entities: ', pre_entities) + self.origins.extend(label_entities) + self.founds.extend(pre_entities) + self.rights.extend([pre_entity for pre_entity in pre_entities if pre_entity in label_entities]) diff --git a/fengshen/metric/utils_ner.py b/fengshen/metric/utils_ner.py new file mode 100644 index 0000000000000000000000000000000000000000..20efe33defdcbef59d75e83a1bf993eaadd962c8 --- /dev/null +++ b/fengshen/metric/utils_ner.py @@ -0,0 +1,261 @@ +import csv +import json +import torch +from transformers import BertTokenizer + + +class CNerTokenizer(BertTokenizer): + def __init__(self, vocab_file, do_lower_case=True): + super().__init__(vocab_file=str(vocab_file), do_lower_case=do_lower_case) + self.vocab_file = str(vocab_file) + self.do_lower_case = do_lower_case + + def tokenize(self, text): + _tokens = [] + for c in text: + if self.do_lower_case: + c = c.lower() + if c in self.vocab: + _tokens.append(c) + else: + _tokens.append('[UNK]') + return _tokens + + +class DataProcessor(object): + """Base class for data converters for sequence classification data sets.""" + + def get_train_examples(self, data_dir): + """Gets a collection of `InputExample`s for the train set.""" + raise NotImplementedError() + + def get_dev_examples(self, data_dir): + """Gets a collection of `InputExample`s for the dev set.""" + raise NotImplementedError() + + def get_labels(self): + """Gets the list of labels for this data set.""" + raise NotImplementedError() + + @classmethod + def _read_tsv(cls, input_file, quotechar=None): + """Reads a tab separated value file.""" + with open(input_file, "r", encoding="utf-8-sig") as f: + reader = csv.reader(f, delimiter="\t", quotechar=quotechar) + lines = [] + for line in reader: + lines.append(line) + return lines + + @classmethod + def _read_text(self, input_file): + lines = [] + with open(input_file, 'r') as f: + words = [] + labels = [] + for line in f: + if line.startswith("-DOCSTART-") or line == "" or line == "\n": + if words: + lines.append({"words": words, "labels": labels}) + words = [] + labels = [] + else: + splits = line.split(" ") + words.append(splits[0]) + if len(splits) > 1: + labels.append(splits[-1].replace("\n", "")) + else: + # Examples could have no label for mode = "test" + labels.append("O") + if words: + lines.append({"words": words, "labels": labels}) + return lines + + @classmethod + def _read_json(self, input_file): + lines = [] + with open(input_file, 'r', encoding='utf8') as f: + for line in f: + line = json.loads(line.strip()) + text = line['text'] + label_entities = line.get('label', None) + words = list(text) + labels = ['O'] * len(words) + if label_entities is not None: + for key, value in label_entities.items(): + for sub_name, sub_index in value.items(): + for start_index, end_index in sub_index: + assert ''.join(words[start_index:end_index+1]) == sub_name + if start_index == end_index: + labels[start_index] = 'S-'+key + else: + if 
end_index - start_index == 1: + labels[start_index] = 'B-' + key + labels[end_index] = 'E-' + key + else: + labels[start_index] = 'B-' + key + labels[start_index + 1:end_index] = ['I-' + key] * (len(sub_name) - 2) + labels[end_index] = 'E-' + key + lines.append({"words": words, "labels": labels}) + return lines + + +def get_entity_bios(seq, id2label, middle_prefix='I-'): + """Gets entities from sequence. + note: BIOS + Args: + seq (list): sequence of labels. + Returns: + list: list of (chunk_type, chunk_start, chunk_end). + Example: + # >>> seq = ['B-PER', 'I-PER', 'O', 'S-LOC'] + # >>> get_entity_bios(seq) + [['PER', 0,1], ['LOC', 3, 3]] + """ + chunks = [] + chunk = [-1, -1, -1] + for indx, tag in enumerate(seq): + if not isinstance(tag, str): + tag = id2label[tag] + if tag.startswith("S-"): + if chunk[2] != -1: + chunks.append(chunk) + chunk = [-1, -1, -1] + chunk[1] = indx + chunk[2] = indx + chunk[0] = tag.split('-')[1] + chunks.append(chunk) + chunk = (-1, -1, -1) + if tag.startswith("B-"): + if chunk[2] != -1: + chunks.append(chunk) + chunk = [-1, -1, -1] + chunk[1] = indx + chunk[0] = tag.split('-')[1] + elif tag.startswith(middle_prefix) and chunk[1] != -1: + _type = tag.split('-')[1] + if _type == chunk[0]: + chunk[2] = indx + if indx == len(seq) - 1: + chunks.append(chunk) + else: + if chunk[2] != -1: + chunks.append(chunk) + chunk = [-1, -1, -1] + return chunks + + +def get_entity_bio(seq, id2label, middle_prefix='I-'): + """Gets entities from sequence. + note: BIO + Args: + seq (list): sequence of labels. + Returns: + list: list of (chunk_type, chunk_start, chunk_end). + Example: + seq = ['B-PER', 'I-PER', 'O', 'B-LOC'] + get_entity_bio(seq) + #output + [['PER', 0,1], ['LOC', 3, 3]] + """ + chunks = [] + chunk = [-1, -1, -1] + for indx, tag in enumerate(seq): + if not isinstance(tag, str): + tag = id2label[tag] + if tag.startswith("B-"): + if chunk[2] != -1: + chunks.append(chunk) + chunk = [-1, -1, -1] + chunk[1] = indx + chunk[0] = tag.split('-')[1] + chunk[2] = indx + if indx == len(seq) - 1: + chunks.append(chunk) + elif tag.startswith(middle_prefix) and chunk[1] != -1: + _type = tag.split('-')[1] + if _type == chunk[0]: + chunk[2] = indx + + if indx == len(seq) - 1: + chunks.append(chunk) + else: + if chunk[2] != -1: + chunks.append(chunk) + chunk = [-1, -1, -1] + return chunks + + +def get_entity_bioes(seq, id2label, middle_prefix='I-'): + """Gets entities from sequence. + note: BIOS + Args: + seq (list): sequence of labels. + Returns: + list: list of (chunk_type, chunk_start, chunk_end). 
+ Example: + # >>> seq = ['B-PER', 'I-PER', 'O', 'S-LOC'] + # >>> get_entity_bios(seq) + [['PER', 0,1], ['LOC', 3, 3]] + """ + chunks = [] + chunk = [-1, -1, -1] + for indx, tag in enumerate(seq): + if not isinstance(tag, str): + tag = id2label[tag] + if tag.startswith("S-"): + if chunk[2] != -1: + chunks.append(chunk) + chunk = [-1, -1, -1] + chunk[1] = indx + chunk[2] = indx + chunk[0] = tag.split('-')[1] + chunks.append(chunk) + chunk = (-1, -1, -1) + if tag.startswith("B-"): + if chunk[2] != -1: + chunks.append(chunk) + chunk = [-1, -1, -1] + chunk[1] = indx + chunk[0] = tag.split('-')[1] + elif (tag.startswith(middle_prefix) or tag.startswith("E-")) and chunk[1] != -1: + _type = tag.split('-')[1] + if _type == chunk[0]: + chunk[2] = indx + if indx == len(seq) - 1: + chunks.append(chunk) + else: + if chunk[2] != -1: + chunks.append(chunk) + chunk = [-1, -1, -1] + return chunks + + +def get_entities(seq, id2label, markup='bio', middle_prefix='I-'): + ''' + :param seq: + :param id2label: + :param markup: + :return: + ''' + assert markup in ['bio', 'bios', 'bioes'] + if markup == 'bio': + return get_entity_bio(seq, id2label, middle_prefix) + elif markup == 'bios': + return get_entity_bios(seq, id2label, middle_prefix) + else: + return get_entity_bioes(seq, id2label, middle_prefix) + + +def bert_extract_item(start_logits, end_logits): + S = [] + start_pred = torch.argmax(start_logits, -1).cpu().numpy()[0][1:-1] + end_pred = torch.argmax(end_logits, -1).cpu().numpy()[0][1:-1] + for i, s_l in enumerate(start_pred): + if s_l == 0: + continue + for j, e_l in enumerate(end_pred[i:]): + if s_l == e_l: + S.append((s_l, i, i + j)) + break + return S diff --git a/fengshen/models/DAVAE/BertForLatentConnector.py b/fengshen/models/DAVAE/BertForLatentConnector.py new file mode 100644 index 0000000000000000000000000000000000000000..08dffce16874a4b263fb604380e5490645cb483e --- /dev/null +++ b/fengshen/models/DAVAE/BertForLatentConnector.py @@ -0,0 +1,137 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""PyTorch BERT model. """ + +from __future__ import absolute_import, division, print_function, unicode_literals + +import json +import logging +import math +import os +import sys +from io import open + +import pdb + +import torch +from torch import nn +from transformers import BertConfig,BertPreTrainedModel +from transformers.models.bert.modeling_bert import BertEmbeddings,BertEncoder,BertPooler + + +class BertForLatentConnector(BertPreTrainedModel): + r""" + Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: + **last_hidden_state**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, hidden_size)`` + Sequence of hidden-states at the output of the last layer of the model. 
+ **pooler_output**: ``torch.FloatTensor`` of shape ``(batch_size, hidden_size)`` + Last layer hidden-state of the first token of the sequence (classification token) + further processed by a Linear layer and a Tanh activation function. The Linear + layer weights are trained from the next sentence prediction (classification) + objective during Bert pretraining. This output is usually *not* a good summary + of the semantic content of the input, you're often better with averaging or pooling + the sequence of hidden-states for the whole input sequence. + **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``) + list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings) + of shape ``(batch_size, sequence_length, hidden_size)``: + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + **attentions**: (`optional`, returned when ``config.output_attentions=True``) + list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``: + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. + + Examples:: + + tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') + model = BertModel.from_pretrained('bert-base-uncased') + input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1 + outputs = model(input_ids) + last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple + + """ + def __init__(self, config, latent_size): + super(BertForLatentConnector, self).__init__(config) + + self.embeddings = BertEmbeddings(config) + self.encoder = BertEncoder(config) + self.pooler = BertPooler(config) + + self.linear = nn.Linear(config.hidden_size, 2 * latent_size, bias=False) + + self.init_weights() + + def _resize_token_embeddings(self, new_num_tokens): + old_embeddings = self.embeddings.word_embeddings + new_embeddings = self._get_resized_embeddings(old_embeddings, new_num_tokens) + self.embeddings.word_embeddings = new_embeddings + return self.embeddings.word_embeddings + + def _prune_heads(self, heads_to_prune): + """ Prunes heads of the model. + heads_to_prune: dict of {layer_num: list of heads to prune in this layer} + See base class PreTrainedModel + """ + for layer, heads in heads_to_prune.items(): + self.encoder.layer[layer].attention.prune_heads(heads) + + def forward(self, input_ids, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, emb_noise=None): + if attention_mask is None: + attention_mask = torch.ones_like(input_ids) + if token_type_ids is None: + token_type_ids = torch.zeros_like(input_ids) + + # We create a 3D attention mask from a 2D tensor mask. + # Sizes are [batch_size, 1, 1, to_seq_length] + # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length] + # this attention mask is more simple than the triangular masking of causal attention + # used in OpenAI GPT, we just need to prepare the broadcast dimension here. + extended_attention_mask = attention_mask.unsqueeze(1).unsqueeze(2) + + # Since attention_mask is 1.0 for positions we want to attend and 0.0 for + # masked positions, this operation will create a tensor which is 0.0 for + # positions we want to attend and -10000.0 for masked positions. + # Since we are adding it to the raw scores before the softmax, this is + # effectively the same as removing these entirely. 
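+        # For intuition: with attention_mask = [[1, 1, 0]] (one sequence, length 3),
+        # the lines below cast the mask to the model dtype and turn it into an
+        # additive bias of 0 for visible positions and -10000 for the padded one,
+        # which is then added to the raw attention scores inside the encoder.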
+ extended_attention_mask = extended_attention_mask.to(dtype=next(self.parameters()).dtype) # fp16 compatibility + extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0 + + # Prepare head mask if needed + # 1.0 in head_mask indicate we keep the head + # attention_probs has shape bsz x n_heads x N x N + # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] + # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] + if head_mask is not None: + if head_mask.dim() == 1: + head_mask = head_mask.unsqueeze(0).unsqueeze(0).unsqueeze(-1).unsqueeze(-1) + head_mask = head_mask.expand(self.config.num_hidden_layers, -1, -1, -1, -1) + elif head_mask.dim() == 2: + head_mask = head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(-1) # We can specify head_mask for each layer + head_mask = head_mask.to(dtype=next(self.parameters()).dtype) # switch to fload if need + fp16 compatibility + else: + head_mask = [None] * self.config.num_hidden_layers + + embedding_output = self.embeddings(input_ids, position_ids=position_ids, token_type_ids=token_type_ids) + + if emb_noise is not None: + embedding_output = embedding_output + emb_noise(embedding_output).to(embedding_output.dtype) + + encoder_outputs = self.encoder(embedding_output, + extended_attention_mask, + head_mask=head_mask) + sequence_output = encoder_outputs[0] + pooled_output = self.pooler(sequence_output) + + outputs = (sequence_output, pooled_output,) + encoder_outputs[1:] # add hidden_states and attentions if they are here + return outputs # sequence_output, pooled_output, (hidden_states), (attentions) diff --git a/fengshen/models/DAVAE/DAVAEModel.py b/fengshen/models/DAVAE/DAVAEModel.py new file mode 100644 index 0000000000000000000000000000000000000000..24261832e029417651b6e61738b391bfc244b8b1 --- /dev/null +++ b/fengshen/models/DAVAE/DAVAEModel.py @@ -0,0 +1,235 @@ +import os +import torch +import torch.nn as nn +import torch.nn.functional as F +from transformers import BertConfig,TransfoXLConfig +from transformers.configuration_utils import PretrainedConfig +from transformers.modeling_utils import PreTrainedModel +from transformers.utils import cached_path,hf_bucket_url +from fengshen.models.DAVAE.GPT2ModelForLatent import GPT2ModelForLatent +from fengshen.models.DAVAE.BertForLatentConnector import BertForLatentConnector +from fengshen.models.DAVAE.run_latent_generation import * +# device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + + +def log_sum_exp(value, dim=None, keepdim=False): + """Numerically stable implementation of the operation + value.exp().sum(dim, keepdim).log() + """ + if dim is not None: + m, _ = torch.max(value, dim=dim, keepdim=True) + value0 = value - m + if keepdim is False: + m = m.squeeze(dim) + return m + torch.log(torch.sum(torch.exp(value0), dim=dim, keepdim=keepdim)) + else: + m = torch.max(value) + sum_exp = torch.sum(torch.exp(value - m)) + return m + torch.log(sum_exp) + +class VAEPretrainedModel(PreTrainedModel): + def _init_weights(self, module): + """ Initialize the weights """ + pass # to bypass the not implement error + +class DAVAEModel(VAEPretrainedModel): + config_class = PretrainedConfig + def __init__(self, config:PretrainedConfig,*model_args, **model_kwargs): + super().__init__(config=config) + self.config = config + self.vae_model = DAVAEModel.load_model(self.config) + + @classmethod + def load_model(cls, config): + encoder_config = BertConfig.from_dict(config.encoder) + encoder_model = 
BertForLatentConnector(config=encoder_config, latent_size=config.latent_size) + dec_config = TransfoXLConfig.from_dict(config.decoder) + dec_config.latent_size = config.latent_size + decoder_model = GPT2ModelForLatent(config=dec_config) + vae_model = EncDecAAE(config,encoder_model, decoder_model, dec_config.latent_size, pad_token_id=50000) + return vae_model + + def set_tokenizers(self,encoder_tokenizer,decoder_tokenizer): + if not hasattr(self, 'encoder_tokenizer'): + self.encoder_tokenizer = encoder_tokenizer + if not hasattr(self, 'decoder_tokenizer'): + self.decoder_tokenizer = decoder_tokenizer + + def simulate_batch(self,encoder_tokenizer,decoder_tokenizer, sent_inputs, prompt=None): + self.set_tokenizers(encoder_tokenizer,decoder_tokenizer) + # 生成相似句 + latent_z = self.latent_code_from_text_batch(sent_inputs) + text_analogy = self.text_from_latent_code_batch(latent_z,prompt=prompt) + return text_analogy + + def latent_code_from_text_batch(self,texts): + # texts->latents + tokens_tensor_list = [] + for text in texts: + tokens = self.encoder_tokenizer.encode(text)[:510] + tokens_tensor_list.append(torch.tensor([101]+tokens+[102])) + + coded = pad_sequence(tokens_tensor_list, batch_first=True, padding_value=0).long() + device = next(self.vae_model.decoder.parameters()).device + with torch.no_grad(): + coded = coded.to(device) + pooled_hidden_fea = self.vae_model.encoder(coded, attention_mask=(coded > 0).float())[1] + mean, logvar = self.vae_model.encoder.linear(pooled_hidden_fea).chunk(2, -1) + + std = logvar.mul(0.5).exp() + eps = torch.zeros_like(std).normal_() + + latent_z = mean + torch.mul(eps, std)*self.config.std_scale + return latent_z + def text_from_latent_code_batch(self,latent_z, prompt=None): + # latents->texts + device = next(self.vae_model.decoder.parameters()).device + past = latent_z + batch_size = latent_z.shape[0] + bos_token = self.decoder_tokenizer.convert_tokens_to_ids(self.decoder_tokenizer.bos_token) + end_token = self.decoder_tokenizer.convert_tokens_to_ids(self.decoder_tokenizer.eos_token) + + if prompt is not None: + prompt = [[bos_token] + self.decoder_tokenizer.encode(text)[:-1] for text in prompt] + else: + prompt = [[bos_token]]*batch_size + + context_tokens_tensor = torch.tensor([[end_token]*self.config.max_out_length]*batch_size).to(device) # 2-d tensor + context_length_tensor = torch.tensor([1]*batch_size).to(device) + for i in range(batch_size): + context_tokens_tensor[i,:len(prompt[i])] = torch.tensor(prompt[i]).long().to(device) + context_length_tensor[i] = len(prompt[i]) + + out = sample_sequence_conditional_batch( + model=self.vae_model.decoder, + max_out_length= self.config.max_out_length, + context_tokens_tensor=context_tokens_tensor, + context_length_tensor=context_length_tensor, + latent_z=latent_z, + temperature=self.config.temperature, + top_k=self.config.top_k, + top_p=self.config.top_p, + repetition_penalty=self.config.repetition_penalty, + device=device + ) + + out_text = [] + for i, tokens in enumerate(out): + tokens = tokens[len(prompt[i]):] + tokens = tokens[:tokens.index(end_token)] if end_token in tokens else tokens + text = self.decoder_tokenizer.decode(tokens, clean_up_tokenization_spaces=True) + out_text.append(filter_noise(text)) + return out_text +class EncDecAAE(nn.Module): + """Adversarial Auto-Encoder""" + def __init__(self,config, encoder, decoder, latent_size, pad_token_id): + super(EncDecAAE, self).__init__() + self.encoder = encoder + self.decoder = decoder + self.config = config + self.pad_token_id = pad_token_id + 
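+        # Latent-space discriminator for the adversarial (AAE) objective: a small
+        # MLP mapping a latent code z to one real/fake logit. loss_adv() below
+        # trains it to tell posterior samples apart from standard-Normal draws.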
self.Disc = nn.Sequential(nn.Linear(latent_size, 4*latent_size), nn.ReLU(), + nn.Linear(4*latent_size, 1)) + # Standard Normal prior + loc = torch.zeros(latent_size) + scale = torch.ones(latent_size) + self.prior = torch.distributions.normal.Normal(loc, scale) + + def connect(self, bert_fea, nsamples=1, fb_mode=0): + """ + Returns: Tensor1, Tensor2 + Tensor1: the tensor latent z with shape [batch, nsamples, nz] + Tensor2: the tenor of KL for each x with shape [batch] + """ + # (batch_size, nz) + + mean, logvar = self.encoder.linear(bert_fea).chunk(2, -1) + z = self.reparameterize(mean, logvar, nsamples) + if fb_mode == 0: + KL = 0.5 * (mean.pow(2) + logvar.exp() - logvar - 1).sum(dim=1) + elif fb_mode == 1: + kl_loss = 0.5 * (mean.pow(2) + logvar.exp() - logvar - 1) + kl_mask = (kl_loss > self.config.dim_target_kl).float() + KL = (kl_mask * kl_loss).sum(dim=1) + + return z, KL + + def connect_deterministic(self, bert_fea, nsamples=1): + """ + Returns: Tensor1, Tensor2 + Tensor1: the tensor latent z with shape [batch, nsamples, nz] + Tensor2: the tenor of KL for each x with shape [batch] + """ + + # (batch_size, nz) + + mean, logvar = self.encoder.linear(bert_fea).chunk(2, -1) + logvar = torch.zeros_like(logvar) + z = self.reparameterize(mean, logvar, nsamples) + KL = 0.5 * (mean.pow(2) + logvar.exp() - logvar - 1).sum(dim=1) + + return z, KL + + def reparameterize(self, mu, logvar, nsamples=1): + """sample from posterior Gaussian family + Args: + mu: Tensor + Mean of gaussian distribution with shape (batch, nz) + logvar: Tensor + logvar of gaussian distibution with shape (batch, nz) + Returns: Tensor + Sampled z with shape (batch, nsamples, nz) + """ + batch_size, nz = mu.size() + std = logvar.mul(0.5).exp() + + mu_expd = mu.unsqueeze(1).expand(batch_size, nsamples, nz) + std_expd = std.unsqueeze(1).expand(batch_size, nsamples, nz) + + eps = torch.zeros_like(std_expd).normal_() + + return mu_expd + torch.mul(eps, std_expd) + + def loss_adv(self, z): + zn = torch.randn_like(z) + zeros = torch.zeros(len(z), 1, device=z.device).half() + ones = torch.ones(len(z), 1, device=z.device).half() + + loss_d = F.binary_cross_entropy_with_logits(self.Disc(z.detach().half()), zeros) + \ + F.binary_cross_entropy_with_logits(self.Disc(zn.half()), ones) + loss_g = F.binary_cross_entropy_with_logits(self.Disc(z.half()), ones) + return loss_d, loss_g + + def forward(self, inputs, labels, beta=0.0, iw=None, fb_mode=0, emb_noise=None): + attention_mask = (inputs > 0).float() + reconstrution_mask = (labels != self.pad_token_id).float() # the padding token for GPT2 + sent_length = torch.sum(reconstrution_mask, dim=1) + + outputs = self.encoder(inputs, attention_mask, emb_noise=emb_noise) + pooled_hidden_fea = outputs[1] + + seq_length = labels.size(1) + dec_attn_mask = self.decoder.get_attn_mask(seq_length).to(labels.device) + + if fb_mode in [0,1]: + latent_z, loss_kl = self.connect(pooled_hidden_fea, fb_mode=fb_mode) + latent_z = latent_z.squeeze(1) + outputs = self.decoder(input_ids=labels, attention_mask=dec_attn_mask, latent_state=latent_z, labels=labels, label_ignore=self.pad_token_id) # ignore loss over padding tokens + loss_rec = outputs[0] # model outputs are always tuple in pytorch-transformers (see doc) + elif fb_mode==2: + latent_z, loss_kl = self.connect_deterministic(pooled_hidden_fea) + latent_z = latent_z.squeeze(1) + outputs = self.decoder(input_ids=labels, attention_mask=dec_attn_mask, latent_state=latent_z, labels=labels, label_ignore=self.pad_token_id) + loss_rec = outputs[0] # model outputs 
are always tuple + + if self.config.length_weighted_loss: + loss = loss_rec / sent_length + beta * loss_kl + else: + loss = loss_rec + beta * loss_kl + + if iw!=None: + total_loss = torch.sum(loss*iw)/torch.sum(iw) + else: + total_loss = torch.sum(loss) + return (loss_rec/sent_length).mean(), loss_kl.mean(), total_loss + diff --git a/fengshen/models/DAVAE/GPT2ModelForLatent.py b/fengshen/models/DAVAE/GPT2ModelForLatent.py new file mode 100644 index 0000000000000000000000000000000000000000..47d5f50a73d26bf38d2fcf7d2620ce3d8aa547af --- /dev/null +++ b/fengshen/models/DAVAE/GPT2ModelForLatent.py @@ -0,0 +1,640 @@ +# coding=utf-8 +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""GPT-2 model.""" + +import torch +import torch.nn.functional as F +import math +import torch.nn as nn +from torch.nn import CrossEntropyLoss +# from ......configuration_transfo_xl import TransfoXLConfig +from transformers import TransfoXLConfig + +from transformers.modeling_utils import ( + PreTrainedModel +) + + +class PositionalEmbedding(torch.nn.Module): + def __init__(self, hidden_size): + super(PositionalEmbedding, self).__init__() + + self.hidden_size = hidden_size + + inv_freq = 1 / (10000 ** (torch.arange(0.0, hidden_size, 2.0) / hidden_size)) + self.register_buffer('inv_freq', inv_freq) + + def forward(self, pos_seq, bsz=None): + sinusoid_inp = torch.ger(pos_seq, self.inv_freq) + pos_emb = torch.cat([sinusoid_inp.sin(), sinusoid_inp.cos()], dim=-1) + + if bsz is not None: + return pos_emb[None, :, :].expand(bsz, -1, -1) + else: + return pos_emb[None, :, :] + +def ensure_divisibility(numerator, denominator): + """Ensure that numerator is divisible by the denominator.""" + assert numerator % denominator == 0, '{} is not divisible by {}'.format( + numerator, denominator) + +def divide(numerator, denominator): + """Ensure that numerator is divisible by the denominator and return + the division value.""" + ensure_divisibility(numerator, denominator) + return numerator // denominator + +def scaled_init_method(sigma, num_layers): + """Init method based on N(0, sigma/sqrt(2*num_layers).""" + std = sigma / math.sqrt(2.0 * num_layers) + def init_(tensor): + return torch.nn.init.normal_(tensor, mean=0.0, std=std) + + return init_ + +def unscaled_init_method(sigma): + """Init method based on N(0, sigma).""" + def init_(tensor): + return torch.nn.init.normal_(tensor, mean=0.0, std=sigma) + + return init_ + +@torch.jit.script +def gelu_impl(x): + """OpenAI's gelu implementation.""" + return 0.5 * x * (1.0 + torch.tanh(0.7978845608028654 * x * + (1.0 + 0.044715 * x * x))) + +def gelu(x): + return gelu_impl(x) + +class GPT2SelfAttention(torch.nn.Module): + """Parallel self-attention layer for GPT2. + + Self-attention layer takes input with size [b, s, h] where b is + the batch size, s is the sequence lenght, and h is the hidden size + and creates output of the same size. + Arguments: + hidden_size: total hidden size of the layer (h). 
+ num_attention_heads: number of attention heads (n). Note that we + require n to be divisible by number of GPUs + used to parallelize the model. Also, we + require hidden size to be divisible by n. + dropout_prob: dropout probability for the attention scores. + init_method: weight initialization. + output_layer_init_method: output layer initialization. If None, use + `init_method`. + We use the following notation: + h: hidden_size + n: num_attention_heads + p: number of partitions + np: n/p + hp: h/p + hn: h/n + b: batch size + s: sequence length + """ + def __init__(self, hidden_size, num_attention_heads, + attention_dropout_prob, output_dropout_prob, + init_method, output_layer_init_method=None, relative_encoding=False): + super(GPT2SelfAttention, self).__init__() + # Set output layer initialization if not provided. + if output_layer_init_method is None: + output_layer_init_method = init_method + # Per attention head and per partition values. + self.hidden_size_per_partition = hidden_size + self.hidden_size_per_attention_head = divide(hidden_size, + num_attention_heads) + self.num_attention_heads_per_partition = num_attention_heads + self.relative_encoding = relative_encoding + # Strided linear layer. + self.query_key_value = torch.nn.Linear(hidden_size, 3*hidden_size, bias=True) + + if relative_encoding: + self.relative = torch.nn.Linear(hidden_size, hidden_size, bias=True) + # Dropout. Note that for a single iteration, this layer will generate + # different outputs on different number of parallel partitions but + # on average it should not be partition dependent. + self.attention_dropout = torch.nn.Dropout(attention_dropout_prob) + + # Output. + self.dense = torch.nn.Linear(hidden_size, hidden_size, bias=True) + self.output_dropout = torch.nn.Dropout(output_dropout_prob) + + def _transpose_for_scores(self, tensor): + """Transpose a 3D tensor [b, s, np*hn] into a 4D tensor with + size [b, np, s, hn]. + """ + new_tensor_shape = tensor.size()[:-1] + \ + (self.num_attention_heads_per_partition, + self.hidden_size_per_attention_head) + tensor = tensor.view(*new_tensor_shape) + return tensor.permute(0, 2, 1, 3) + + @staticmethod + def _rel_shift(x, zero_triu=False): + # ql x kl x bsz x h + # bsz x h x ql x kl + zero_pad = torch.zeros((*x.size()[:-2], x.size(-2), 1), + device=x.device, dtype=x.dtype) + x_padded = torch.cat([zero_pad, x], dim=-1) + + x_padded = x_padded.view(*x.size()[:-2], x.size(-1) + 1, x.size(-2)) + + x = x_padded[:, :, 1:].view_as(x) + + if zero_triu: + ones = torch.ones((x.size(0), x.size(1))) + x = x * torch.tril(ones, x.size(1) - x.size(0))[:, :, None, None] + + return x + + @staticmethod + def _rel_shift_latest(x: torch.Tensor): + ndims = x.dim() + x_shape = x.size() + row_dim = 2 + col_dim = row_dim + 1 + assert col_dim < ndims + tgt_shape_1, tgt_shape_2 = [], [] + for i in range(ndims): + if i == row_dim: + tgt_shape_1.append(x_shape[col_dim]) + tgt_shape_2.append(x_shape[row_dim]) + elif i == col_dim: + tgt_shape_1.append(x_shape[row_dim]) + tgt_shape_2.append(x_shape[col_dim] - 1) + else: + tgt_shape_1.append(x_shape[i]) + tgt_shape_2.append(x_shape[i]) + x = x.view(*tgt_shape_1) + x = x[:, :, 1:, :] + x = x.view(*tgt_shape_2) + return x + + def forward(self, hidden_states, ltor_mask, position_embeddings=None, r_w_bias=None, r_r_bias=None, mem=None): + # hidden_states: [b, s, h] + # ltor_mask: [1, 1, s, s] + + # Attention heads. 
[b, s, hp] + query_length = hidden_states.size(1) + + if mem is None: + mixed_x_layer = self.query_key_value(hidden_states) + (mixed_query_layer, + mixed_key_layer, + mixed_value_layer) = torch.chunk(mixed_x_layer, 3, dim=-1) + else: + cat = torch.cat((mem, hidden_states), 1) + mixed_x_layer = self.query_key_value(cat) + (mixed_query_layer, + mixed_key_layer, + mixed_value_layer) = torch.chunk(mixed_x_layer, 3, dim=-1) + mixed_query_layer = mixed_query_layer[:, -query_length:] + + # Reshape and transpose [b, np, s, hn] + query_layer = self._transpose_for_scores(mixed_query_layer) + key_layer = self._transpose_for_scores(mixed_key_layer) + value_layer = self._transpose_for_scores(mixed_value_layer) + if self.relative_encoding: + relative_layer = self.relative(position_embeddings) + relative_layer = self._transpose_for_scores(relative_layer) # 1 (bsz) x n_head x klen x d_head + # Raw attention scores. [b, np, qs, ks] + rw_head_q = query_layer + r_w_bias.unsqueeze(1) + ac_score = torch.matmul(rw_head_q, key_layer.transpose(-1, -2)) + rr_head_q = query_layer + r_r_bias.unsqueeze(1) + bd_score = torch.matmul(rr_head_q, relative_layer.transpose(-1, -2)) + bd_score = self._rel_shift(bd_score) # qlen x klen x bsz x n_head + # bd_score = bd_score.permute(2, 3, 0, 1) # bsz n_head qlen klen + + attention_scores = ac_score + bd_score + else: + # Raw attention scores. [b, np, s, s] + attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2)) + attention_scores = attention_scores / math.sqrt( + self.hidden_size_per_attention_head) + + # Apply the left to right attention mask. + attention_scores = torch.mul(attention_scores, ltor_mask) - \ + 10000.0 * (1.0 - ltor_mask) + + # Attention probabilities. [b, np, s, s] + attention_probs = torch.nn.Softmax(dim=-1)(attention_scores) + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. + # with get_cuda_rng_tracker().fork(): + # attention_probs = self.attention_dropout(attention_probs) + + # Context layer. + # [b, np, s, hn] + # print(f'attn_probs {attention_probs}, value_layer {value_layer}') + context_layer = torch.matmul(attention_probs, value_layer.float()) + # [b, s, np, hn] + context_layer = context_layer.permute(0, 2, 1, 3).contiguous() + new_context_layer_shape = context_layer.size()[:-2] + \ + (self.hidden_size_per_partition,) + # [b, s, hp] + context_layer = context_layer.view(*new_context_layer_shape) + + # Output. [b, s, h] + output = self.dense(context_layer) + output = self.output_dropout(output) + + return output + +class GPT2MLP(torch.nn.Module): + """MLP for GPT2. + + MLP will take the input with h hidden state, project it to 4*h + hidden dimension, perform gelu transformation, and project the + state back into h hidden dimension. At the end, dropout is also + applied. + + Arguments: + hidden_size: The hidden size of the self attention. + output_dropout_prob: dropout probability for the outputs + after self attention and final output. + init_method: initialization method used for the weights. Note + that all biases are initialized to zero and + layernorm weight are initialized to one. + output_layer_init_method: output layer initialization. If None, + use `init_method`. + """ + + def __init__(self, hidden_size, output_dropout_prob, init_method, + output_layer_init_method=None): + super(GPT2MLP, self).__init__() + # Set output layer initialization if not provided. 
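+        # If the caller does not pass a dedicated initializer for the output-side
+        # projection, reuse init_method (cf. scaled_init_method above, which shrinks
+        # the std by 1/sqrt(2*num_layers) for layers feeding a residual connection).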
+ if output_layer_init_method is None: + output_layer_init_method = init_method + # Project to 4h. + self.dense_h_to_4h = torch.nn.Linear(hidden_size, 4*hidden_size) + # Project back to h. + self.dense_4h_to_h = torch.nn.Linear(4*hidden_size, hidden_size) + self.dropout = torch.nn.Dropout(output_dropout_prob) + + def forward(self, hidden_states): + # [b, s, 4hp] + intermediate_parallel = self.dense_h_to_4h(hidden_states) + intermediate_parallel = gelu(intermediate_parallel) + + # [b, s, h] + output = self.dense_4h_to_h(intermediate_parallel) + output = self.dropout(output) + return output + + +class GPT2TransformerLayer(torch.nn.Module): + """A single layer transformer for GPT2. + + We use the following notation: + h: hidden size + n: number of attention heads + b: batch size + s: sequence length + Transformore layer takes input with size [b, s, h] and returns an + output of the same size. + + Arguments: + hidden_size: The hidden size of the self attention. + num_attention_heads: number of attention head in the self + attention. + attention_dropout_prob: dropout probability of the attention + score in self attention. + output_dropout_prob: dropout probability for the outputs + after self attention and final output. + layernorm_epsilon: epsilon used in layernorm to avoid + division by zero. + init_method: initialization method used for the weights. Note + that all biases are initialized to zero and + layernorm weight are initialized to one. + output_layer_init_method: output layers (attention output and + mlp output) initialization. If None, + use `init_method`. + """ + def __init__(self, + hidden_size, + num_attention_heads, + attention_dropout_prob, + output_dropout_prob, + layernorm_epsilon, + init_method, + output_layer_init_method=None, + relative_encoding=False): + super(GPT2TransformerLayer, self).__init__() + # Set output layer initialization if not provided. + if output_layer_init_method is None: + output_layer_init_method = init_method + + # Layernorm on the input data. + self.input_layernorm = torch.nn.LayerNorm(hidden_size, eps=layernorm_epsilon) + + # Self attention. + self.attention = GPT2SelfAttention( + hidden_size, + num_attention_heads, + attention_dropout_prob, + output_dropout_prob, + init_method, + output_layer_init_method=output_layer_init_method, + relative_encoding=relative_encoding) + + # Layernorm on the input data. + self.post_attention_layernorm = torch.nn.LayerNorm(hidden_size, + eps=layernorm_epsilon) + + # MLP + self.mlp = GPT2MLP( + hidden_size, + output_dropout_prob, + init_method, + output_layer_init_method=output_layer_init_method) + + def forward(self, hidden_states, ltor_mask, position_embeddings=None, r_w_bias=None, r_r_bias=None, mem=None): + # hidden_states: [b, s, h] + # ltor_mask: [1, 1, s, s] + + # Layer norm at the begining of the transformer layer. + layernorm_output = self.input_layernorm(hidden_states) + mem = self.input_layernorm(mem) if mem is not None else None + # Self attention. + attention_output = self.attention(layernorm_output, ltor_mask, position_embeddings, r_w_bias, r_r_bias, mem) + # Residual connection. + # print(f'hz {hidden_states.shape}, attn {attention_output.shape}') + layernorm_input = hidden_states + attention_output + # Layer norm post the self attention. + layernorm_output = self.post_attention_layernorm(layernorm_input) + # MLP. + mlp_output = self.mlp(layernorm_output) + # Second residual connection. 
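+        # Net effect of this pre-LayerNorm block:
+        #   out = (x + Attn(LN(x))) + MLP(LN(x + Attn(LN(x)))),
+        # where the line below adds the second of the two residual paths.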
+ output = layernorm_input + mlp_output + + return output + +class GPT2TransformerForLatent(torch.nn.Module): + """GPT-2 transformer. + + This module takes input from embedding layer and it's output can + be used directly by a logit layer. It consists of L (num-layers) + blocks of: + layer norm + self attention + residual connection + layer norm + mlp + residual connection + followed by a final layer norm. + + Arguments: + num_layers: Number of transformer layers. + hidden_size: The hidden size of the self attention. + num_attention_heads: number of attention head in the self + attention. + attention_dropout_prob: dropout probability of the attention + score in self attention. + output_dropout_prob: dropout probability for the outputs + after self attention and final output. + checkpoint_activations: if True, checkpoint activations. + checkpoint_num_layers: number of layers to checkpoint. This + is basically the chunk size in checkpoitning. + layernorm_epsilon: epsilon used in layernorm to avoid + division by zero. + init_method_std: standard deviation of the init method which has + the form N(0, std). + use_scaled_init_for_output_weights: If Ture use 1/sqrt(2*num_layers) + scaling for the output weights ( + output of self attention and mlp). + """ + def __init__(self, + num_layers, + hidden_size, + num_attention_heads, + max_sequence_length, + max_memory_length, + embedding_dropout_prob, + attention_dropout_prob, + output_dropout_prob, + checkpoint_activations, + latent_size = 64, + checkpoint_num_layers=1, + layernorm_epsilon=1.0e-5, + init_method_std=0.02, + use_scaled_init_for_output_weights=True, + relative_encoding=False): + super(GPT2TransformerForLatent, self).__init__() + # Store activation checkpoiting flag. + self.checkpoint_activations = checkpoint_activations + self.checkpoint_num_layers = checkpoint_num_layers + self.max_memory_length = max_memory_length + + self.latent_size = latent_size + # self.linear = nn.Linear(self.latent_size, hidden_size * num_layers, bias=False).float() # different latent vector for each layer + # self.linear_emb = nn.Linear(self.latent_size, hidden_size * num_layers, bias=False).float() + self.linear_emb = nn.Linear(self.latent_size, hidden_size, bias=False).float() + + # torch.nn.init.normal_(self.linear.weight, mean=0.0, std=init_method_std) + torch.nn.init.normal_(self.linear_emb.weight, mean=0.0, std=init_method_std) + + + output_layer_init_method = None + if use_scaled_init_for_output_weights: + output_layer_init_method = scaled_init_method(init_method_std, + num_layers) + # Embeddings dropout + self.embedding_dropout = torch.nn.Dropout(embedding_dropout_prob) + self.relative_encoding = relative_encoding + if relative_encoding: + # Relative position embedding + self.position_embeddings = PositionalEmbedding(hidden_size) + # Per attention head and per partition values. + self.hidden_size_per_attention_head = divide(hidden_size, + num_attention_heads) + self.num_attention_heads_per_partition = num_attention_heads + self.r_w_bias = torch.nn.Parameter( + torch.Tensor(self.num_attention_heads_per_partition, self.hidden_size_per_attention_head)) + self.r_r_bias = torch.nn.Parameter( + torch.Tensor(self.num_attention_heads_per_partition, self.hidden_size_per_attention_head)) + + # Always initialize bias to zero. + with torch.no_grad(): + self.r_w_bias.zero_() + self.r_r_bias.zero_() + else: + # Position embedding (serial). + self.position_embeddings = torch.nn.Embedding(max_sequence_length, + hidden_size) + # Initialize the position embeddings. 
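+            # Learned absolute position table for the non-relative case; when
+            # relative_encoding is True the Transformer-XL style PositionalEmbedding
+            # with r_w_bias / r_r_bias above is created instead.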
+ torch.nn.init.normal_(self.position_embeddings.weight, mean=0.0, std=init_method_std) + + def get_layer(): + return GPT2TransformerLayer( + hidden_size, + num_attention_heads, + attention_dropout_prob, + output_dropout_prob, + layernorm_epsilon, + unscaled_init_method(init_method_std), + output_layer_init_method=output_layer_init_method, + relative_encoding=relative_encoding) + + # Transformer layers. + self.layers = torch.nn.ModuleList( + [get_layer() for _ in range(num_layers)]) + + # Final layer norm before output. + self.final_layernorm = torch.nn.LayerNorm(hidden_size, eps=layernorm_epsilon) + + + def forward(self, hidden_states, attention_mask, latent_state, mems): + batch_size, query_length, hidden_size = hidden_states.size() + # memory_length = self.latent_size + memory_length = mems[0].size(1) if mems else 0 + + # key_length = query_length + memory_length+1 + # attention_mask = attention_mask[:, :, :, -query_length-memory_length-1:] + key_length = query_length + memory_length + attention_mask = attention_mask[:, :, :, -query_length - memory_length:] + + if latent_state is not None: + latent_emb = self.linear_emb(latent_state) + # latent_emb = torch.split(latent_emb.unsqueeze(1), hidden_size, dim=2) + latent_emb = latent_emb.unsqueeze(1) + # print(f'latent_state {latent_state.half()}\n linear_emb {self.linear_emb.weight} \n latent_emb {latent_emb}') + # torch.save(latent_state, '/cognitive_comp/wanghao/experiments/fengshen/latent_state.pt') + # torch.save(self.linear_emb, '/cognitive_comp/wanghao/experiments/fengshen/weight.pt') + + + position_sequence = torch.arange(key_length - 1, -1, -1.0, device=hidden_states.device, + dtype=hidden_states.dtype) + position_embeddings = self.position_embeddings(position_sequence) + + # print(f'pos {position_embeddings.shape}, latent {latent_emb.shape}') + # if latent_state is not None: + # position_embeddings += latent_emb.unsqueeze(0) + # Apply dropout + position_embeddings = self.embedding_dropout(position_embeddings) + + # print(f'latent_emb {latent_emb.shape}, {hidden_states.shape}') + if latent_state is not None: + hidden_states = hidden_states + latent_emb + hidden_states = self.embedding_dropout(hidden_states) + + # latent_mem = self.linear(latent_state.half()) + # latent_mem = torch.split(latent_mem.unsqueeze(1), hidden_size, dim=2) + + if self.max_memory_length > 0: + mem_layers = [hidden_states.detach()] + else: + mem_layers = [] + + for i, layer in enumerate(self.layers): + args = [hidden_states, attention_mask] + if self.relative_encoding: + args += [position_embeddings, self.r_w_bias, self.r_r_bias] + + mem_i = mems[i] if mems else None + # print(f'mems {len(mems)} {mems[0].shape}') + # mem_i = torch.cat((latent_mem[i], mems[i]), 1) if mems else latent_mem[i] + # print(f'mem_i {mem_i.shape}, {mem_i}') + hidden_states = layer(*args, mem=mem_i) + + if latent_state is not None: + hidden_states = hidden_states + latent_emb + + if self.max_memory_length > 0: + mem_layers.append(hidden_states.detach()) + # print(f'mem_layers {len(mem_layers)} mems {len(mems)}') + # Final layer norm. 
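+        # After the final LayerNorm below, and when max_memory_length > 0, the
+        # per-layer states collected in mem_layers are folded into the rolling
+        # Transformer-XL style memory via update_mems().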
+ output = self.final_layernorm(hidden_states) + if self.max_memory_length > 0: + mem_layers = self.update_mems(mem_layers, mems) + + return (output, mem_layers) + + def update_mems(self, hiddens, mems): + memory_length = mems[0].size(1) if mems else 0 + query_length = hiddens[0].size(1) + new_memory_length = min(self.max_memory_length, memory_length + query_length) + new_mems = [] + with torch.no_grad(): + for i in range(len(hiddens)): + if new_memory_length <= query_length: + new_mems.append(hiddens[i][:, -new_memory_length:]) + else: + new_mems.append(torch.cat((mems[i][:, -new_memory_length+query_length:], hiddens[i]), dim=1)) + return new_mems + + +class GPT2ModelForLatent(PreTrainedModel): + """GPT-2 Language model. + + The output of the forward method are the logits (parallel or + serial depending on the `parallel_output` flag. + """ + + def _init_weights(self, module): + """ Initialize the weights """ + pass # to bypass the not implement error + + def __init__(self, config:TransfoXLConfig): + super().__init__(config) + self.config = config + + self.word_embeddings = torch.nn.Embedding(config.vocab_size, config.hidden_size) + + # Transformer + self.transformer = GPT2TransformerForLatent(config.num_layers, + config.hidden_size, + config.num_attention_heads, + config.max_sequence_length, + config.max_memory_length, + config.embedding_dropout_prob, + config.attention_dropout_prob, + config.output_dropout_prob, + config.checkpoint_activations, + config.latent_size, + config.checkpoint_num_layers, + relative_encoding=config.relative_encoding) + + + def forward(self, input_ids, attention_mask, latent_state, mems=None, labels=None, label_ignore=None): + embeddings = self.word_embeddings(input_ids) + + # Transformer. + logits, hidden_layers = self.transformer(embeddings, attention_mask, latent_state, mems) + lm_logits = F.linear(logits, + self.word_embeddings.weight) + + outputs = (lm_logits, hidden_layers) # (bz, sql, vocab), () + if labels is not None: + shift_logits = lm_logits[..., :-1, :].contiguous() + shift_labels = labels[..., 1:].contiguous() + + loss_fct = CrossEntropyLoss(ignore_index=label_ignore, reduce=False) + loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), + shift_labels.view(-1)) + loss = torch.sum(loss.view(-1, shift_labels.shape[-1]), -1) + outputs = (loss,) + outputs + + return outputs + + def get_attn_mask(self, seq_length): + # mem_length = self.config.max_memory_length + 1 + mem_length = self.config.max_memory_length + attention_mask = torch.ones((1, seq_length, seq_length + mem_length)) + attention_mask = torch.tril(torch.triu(attention_mask, 1 - seq_length + mem_length), mem_length) + attention_mask = attention_mask.unsqueeze(1) + return attention_mask diff --git a/fengshen/models/DAVAE/__init__.py b/fengshen/models/DAVAE/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..ae84b6461f88023821149c4d8a994cfc24e6f38c --- /dev/null +++ b/fengshen/models/DAVAE/__init__.py @@ -0,0 +1,15 @@ +# coding=utf-8 +# Copyright 2022 IDEA-CCNL The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +""" PyTorch DAVAE model. """ diff --git a/fengshen/models/DAVAE/run_latent_generation.py b/fengshen/models/DAVAE/run_latent_generation.py new file mode 100644 index 0000000000000000000000000000000000000000..f9f099d205279d883df589fe5031ff0fdbcfb32d --- /dev/null +++ b/fengshen/models/DAVAE/run_latent_generation.py @@ -0,0 +1,302 @@ +import re +import torch +import torch.nn.functional as F +from torch.nn.utils.rnn import pad_sequence +import numpy as np +import json +import jsonlines +from tqdm import tqdm, trange + +def set_seed(args): + np.random.seed(args.seed) + torch.manual_seed(args.seed) + if args.n_gpu > 0: + torch.cuda.manual_seed_all(args.seed) + +def filter_noise(text): + space_pattern = '([\u4e00-\u9fa5|0-9|,|。|?|!|@|¥|……|——|《|》|“|”|、|;|:|‘|’|(|)|「|」|【|】|·|~|-|+])\s+([\u4e00-\u9fa5|0-9|,|。|?|!|@|¥|……|——|《|》|“|”|、|;|:|‘|’|(|)|「|」|【|】|·|~|-|+])' + text = re.sub(space_pattern, r'\1\2', text) + text = re.sub(space_pattern, r'\1\2', text) + patterns = ['引用日期.*$', '参考资料.*$', '\[.*\]', '【.*】', '原文地址:', '原文转载:', '本文转自:', '本文摘要:', ''] + for pattern in patterns: + text = re.sub(pattern, "", text) + return text.strip() + +def get_raw_data(raw_data): + train_data = {} + with open(raw_data, 'r', encoding='utf8') as fh: + for line in fh: + line = json.loads(line) + for key in line.keys(): + if key not in train_data.keys(): + train_data[key] = [line[key]] + else: + train_data[key].append(line[key]) + return train_data + +def save_output(input_text, output, output_file): + with jsonlines.open(output_file, mode='a') as writer: + for text_in,text_out in zip(input_text, output): + otc = {} + otc['text_a'] = str(text_in) + otc['text_b'] = str(text_out) + writer.write(otc) + +def enforce_repetition_penalty(lprobs, prev_output_tokens, repetition_penalty = 1.5): + """repetition penalty (from CTRL paper https://arxiv.org/abs/1909.05858). """ + for i in range(len(prev_output_tokens)): + for previous_token in set(prev_output_tokens[i]): + # if score < 0 then repetition penalty has to multiplied to reduce the previous token probability + if lprobs[i, previous_token] < 0: + lprobs[i, previous_token] *= repetition_penalty + else: + lprobs[i, previous_token] /= repetition_penalty + +def top_k_top_p_filtering(logits, top_k=0, top_p=0.0, filter_value=-float('Inf')): + """ Filter a distribution of logits using top-k and/or nucleus (top-p) filtering + Args: + logits: logits distribution shape (vocabulary size) + top_k > 0: keep only top k tokens with highest probability (top-k filtering). + top_p > 0.0: keep the top tokens with cumulative probability >= top_p (nucleus filtering). + Nucleus filtering is described in Holtzman et al. 
(http://arxiv.org/abs/1904.09751) + From: https://gist.github.com/thomwolf/1a5a29f6962089e871b94cbd09daf317 + """ + # assert logits.dim() == 1# batch size 1 for now - could be updated for more but the code would be less clear + top_k = min(top_k, logits.size(-1)) # Safety check + if top_k > 0: + # Remove all tokens with a probability less than the last token of the top-k + indices_to_remove = logits < torch.topk(logits, top_k)[0][..., -1, None] + logits[indices_to_remove] = filter_value + + if top_p > 0.0: + sorted_logits, sorted_indices = torch.sort(logits, dim=-1, descending=True) + cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1) + + # Remove tokens with cumulative probability above the threshold + sorted_indices_to_remove = cumulative_probs > top_p + # Shift the indices to the right to keep also the first token above the threshold + sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone() + sorted_indices_to_remove[..., 0] = 0 + + for i in range(sorted_indices.size()[0]): + indices_to_remove = sorted_indices[i][sorted_indices_to_remove[i]] + logits[i][indices_to_remove] = filter_value + # indices_to_remove = sorted_indices[sorted_indices_to_remove] + # logits[indices_to_remove] = filter_value + return logits + +def sample_sequence_conditional(model, length, context, latent_z=None, temperature=1, top_k=0, top_p=0.0, repetition_penalty=1.0, device='cpu'): + + context = torch.tensor(context, dtype=torch.long, device=device) + context = context.unsqueeze(0) + generated = context + with torch.no_grad(): + for i in trange(length): + if i == 2: + generated[generated[:, 1] == 127, 1] = 0 + attention_mask = model.get_attn_mask(generated.shape[1]).to(device) + inputs = {'input_ids': generated, 'latent_state': latent_z, 'attention_mask':attention_mask, 'mems':None} + outputs = model(**inputs) # Note: we could also use 'past' with GPT-2/Transfo-XL/XLNet (cached hidden-states) + next_token_logits = outputs[0][:, -1, :] / temperature + filtered_logits = top_k_top_p_filtering(next_token_logits, top_k=top_k, top_p=top_p) + + log_probs = F.softmax(filtered_logits, dim=-1) + if repetition_penalty != 1.0: + enforce_repetition_penalty(log_probs, generated, repetition_penalty) + next_token = torch.multinomial(log_probs, num_samples=1) + generated = torch.cat((generated, next_token), dim=1) + # pdb.set_trace() + # if next_token[0,0].item() == decoder_tokenizer.encode('')[0]: + if next_token[0, 0] == 50000: # end of token 50000 + break + + return generated + +def latent_code_from_text(text, tokenizer_encoder, model_vae, args, scale=1.0): + tokenized1 = tokenizer_encoder.encode(text) + coded = torch.Tensor([tokenized1]).long() + with torch.no_grad(): + coded = coded.to(device) + outputs = model_vae.encoder(coded, attention_mask=(coded > 0).float()) + pooled_hidden_fea = outputs[1] + + mean, logvar = model_vae.encoder.linear(pooled_hidden_fea).chunk(2, -1) + std = logvar.mul(0.5).exp() + eps = torch.zeros_like(std).normal_() + + return mean + torch.mul(eps, std)*scale + +def text_from_latent_code(latent_z, model_vae, args, tokenizer_decoder, prompt=None): + bos_token = tokenizer_decoder.convert_tokens_to_ids(tokenizer_decoder.bos_token) + context_tokens = [bos_token] + + if prompt is not None: + context_tokens.append(tokenizer_decoder.encode(prompt)[:-1]) # remove eos token + + out = sample_sequence_conditional( + model=model_vae.decoder, + context=context_tokens, + latent_z=latent_z, + length= args.max_out_length, # Chunyuan: Fix length; or use to complete a 
sentence + temperature=args.temperature, + top_k=args.top_k, + top_p=args.top_p, + repetition_penalty=args.repetition_penalty, + device=device + ) + + out_tokens = out[0, :].tolist() + out_tokens = out_tokens[1:out_tokens.index(50000)] if 50000 in out_tokens else out_tokens # remove bos and eos + text_x1 = tokenizer_decoder.decode(out_tokens, clean_up_tokenization_spaces=True) + + return text_x1 + + +def simulate(model_vae, tokenizer_encoder, tokenizer_decoder, args, sent_input, prompt=None): + latent_z, _ = latent_code_from_text(sent_input, tokenizer_encoder, model_vae, args) + text_analogy = text_from_latent_code(latent_z, model_vae, args, tokenizer_decoder, prompt=prompt) + + return text_analogy + +def switch(next_value, init, is_update): + is_update = is_update.type_as(next_value) + return (1-is_update)*init + is_update*next_value + +def sample_sequence_conditional_batch(model, max_out_length, context_tokens_tensor, context_length_tensor, latent_z=None, temperature=1, top_k=0, top_p=0.0, repetition_penalty=1.0, device='cpu', end_token=50000): + org_context_length = torch.min(context_length_tensor).item() + batch_size = context_tokens_tensor.shape[0] + + generated = context_tokens_tensor[:,:org_context_length] + counter = org_context_length + + output_tokens_lists = [] + output_order = [] + orig_order = torch.LongTensor(list(range(batch_size))) + + with torch.no_grad(): + while counter < max_out_length: + if counter == org_context_length+2: + generated[generated[:,org_context_length] == 127, org_context_length] = 0 + attention_mask = model.get_attn_mask(generated.shape[1]).to(device) + inputs = {'input_ids': generated, 'latent_state': latent_z, 'attention_mask': attention_mask} + outputs = model(**inputs) # Note: we could also use 'past' with GPT-2/Transfo-XL/XLNet (cached hidden-states) + next_token_logits = outputs[0][:, -1, :] / temperature + filtered_logits = top_k_top_p_filtering(next_token_logits, top_k=top_k, top_p=top_p) + + # if counter == org_context_length: + # filtered_logits[:, 43488] = -float('Inf') # forbid starting with '《' + log_probs = F.softmax(filtered_logits, dim=-1) + + if repetition_penalty != 1.0: + enforce_repetition_penalty(log_probs, generated, repetition_penalty) + + if any(log_probs.sum(dim=-1) <= 0.0) : + break + next_token = torch.multinomial(log_probs, num_samples=1).view(-1) + next_token = switch(next_token, context_tokens_tensor[:, counter], context_length_tensor<=counter) + + if torch.all(next_token == end_token).item(): + break + + stop_idx = next_token == end_token + output_order.extend(orig_order[stop_idx].tolist()) + + finished = generated[stop_idx] + output_tokens_lists.extend(finished.detach().cpu().tolist()) + # continue with non-ending tokens + conti_idx = next_token != end_token + orig_order = orig_order[conti_idx] + generated = generated[conti_idx] + latent_z = latent_z[conti_idx] + + next_token = next_token[conti_idx] + context_tokens_tensor = context_tokens_tensor[conti_idx] + context_length_tensor = context_length_tensor[conti_idx] + batch_size = generated.shape[0] + + generated = torch.cat((generated, next_token.view(batch_size, 1)), dim=-1) + counter += 1 + + output_order.extend(orig_order.tolist()) + generated = generated.detach().cpu().tolist() + output_tokens_lists.extend(generated) + output_tokens_lists = [tokens[:tokens.index(end_token)] if end_token in tokens else tokens for tokens in output_tokens_lists] + + output_tokens_lists = [tokens for _,tokens in sorted(zip(output_order, output_tokens_lists))] + + return output_tokens_lists 
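+# The helpers below form the batched encode/decode path: latent_code_from_text_batch
+# tokenizes each text (truncated to 510 tokens and wrapped in the [CLS]/[SEP] ids 101/102),
+# pools the encoder output, and reparameterizes the predicted mean/logvar into a latent z
+# scaled by args.std_scale; text_from_latent_code_batch then decodes those latents with
+# sample_sequence_conditional_batch above, optionally seeding each sequence with a prompt.
+# A minimal usage sketch (assuming the same `args` namespace used elsewhere in this file):
+#   latent_z = latent_code_from_text_batch(texts, tokenizer_encoder, model_vae, args)
+#   out_text = text_from_latent_code_batch(latent_z, model_vae, args, tokenizer_decoder)
+# simulate_batch chains exactly these two calls.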
+ +def latent_code_from_text_batch(texts, tokenizer_encoder, model_vae, args): + tokens_tensor_list = [] + for text in texts: + tokens = tokenizer_encoder.encode(text)[:510] + tokens_tensor_list.append(torch.tensor([101]+tokens+[102])) + + coded = pad_sequence(tokens_tensor_list, batch_first=True, padding_value=0).long() + with torch.no_grad(): + coded = coded.to(device) + pooled_hidden_fea = model_vae.encoder(coded, attention_mask=(coded > 0).float())[1] + mean, logvar = model_vae.encoder.linear(pooled_hidden_fea).chunk(2, -1) + + std = logvar.mul(0.5).exp() + eps = torch.zeros_like(std).normal_() + + latent_z = mean + torch.mul(eps, std)*args.std_scale + + return latent_z + +def text_from_latent_code_batch(latent_z, model_vae, args, tokenizer_decoder, prompt=None): + past = latent_z + batch_size = latent_z.shape[0] + bos_token = tokenizer_decoder.convert_tokens_to_ids(tokenizer_decoder.bos_token) + end_token = tokenizer_decoder.convert_tokens_to_ids(tokenizer_decoder.eos_token) + + if prompt is not None: + prompt = [[bos_token] + tokenizer_decoder.encode(text)[:-1] for text in prompt] + else: + prompt = [[bos_token]]*batch_size + + context_tokens_tensor = torch.tensor([[end_token]*args.max_out_length]*batch_size).to(device) # 2-d tensor + context_length_tensor = torch.tensor([1]*batch_size).to(device) + for i in range(batch_size): + context_tokens_tensor[i,:len(prompt[i])] = torch.tensor(prompt[i]).long().to(device) + context_length_tensor[i] = len(prompt[i]) + + # length = 128 # maximum length, but not used + out = sample_sequence_conditional_batch( + model=model_vae.decoder, + max_out_length= args.max_out_length, # Chunyuan: Fix length; or use to complete a sentence + context_tokens_tensor=context_tokens_tensor, + context_length_tensor=context_length_tensor, + latent_z=latent_z, + temperature=args.temperature, + top_k=args.top_k, + top_p=args.top_p, + repetition_penalty=args.repetition_penalty, + device=device + ) + + out_text = [] + for i, tokens in enumerate(out): + tokens = tokens[len(prompt[i]):] + tokens = tokens[:tokens.index(end_token)] if end_token in tokens else tokens + text = tokenizer_decoder.decode(tokens, clean_up_tokenization_spaces=True) + out_text.append(filter_noise(text)) + return out_text + + +def simulate_batch(model_vae, tokenizer_encoder, tokenizer_decoder, args, sent_inputs, prompt=None): + latent_z = latent_code_from_text_batch(sent_inputs, tokenizer_encoder, model_vae, args) + text_analogy = text_from_latent_code_batch(latent_z, model_vae, args, tokenizer_decoder, prompt=prompt) + return text_analogy + +def simulate_bz(model_vae, tokenizer_encoder, tokenizer_decoder, args, sent_inputs, prompt=None): + latent_z = latent_code_from_text_batch(sent_inputs, tokenizer_encoder, model_vae, args) + return latent_z + +def my_shuffle(x, index): + result = [] + for field in index: + result.append(x[field]) + return result + diff --git a/fengshen/models/GAVAE/GAVAEModel.py b/fengshen/models/GAVAE/GAVAEModel.py new file mode 100644 index 0000000000000000000000000000000000000000..fa74f95fd775ed17c9e25d9564f94c93b50347f8 --- /dev/null +++ b/fengshen/models/GAVAE/GAVAEModel.py @@ -0,0 +1,67 @@ +# -*- encoding: utf-8 -*- +''' +Copyright 2022 The International Digital Economy Academy (IDEA). CCNL team. All rights reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +@File : GAVAEModel.py +@Time : 2022/11/04 11:35 +@Author : Liang Yuxin +@Version : 1.0 +@Contact : liangyuxin@idea.edu.cn +@License : (C)Copyright 2022-2023, CCNL-IDEA +''' +import torch +from transformers.modeling_utils import PreTrainedModel +from transformers.configuration_utils import PretrainedConfig + +from fengshen.models.DAVAE.DAVAEModel import DAVAEModel +from fengshen.models.GAVAE.gans_model import gans_process + + +device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + +class GAVAEPretrainedModel(PreTrainedModel): + def _init_weights(self, module): + """ Initialize the weights """ + pass # to bypass the not implement error + +class GAVAEModel(GAVAEPretrainedModel): + config_class = PretrainedConfig + def __init__(self, config:PretrainedConfig) -> None: + super().__init__(config) + self.config =config + config.device = device + self.gan = gans_process(self.config) + self.vae_model = DAVAEModel(self.config) + + def train_gan(self,encoder_tokenizer,decoder_tokenizer,input_texts): + self.vae_model.set_tokenizers(encoder_tokenizer,decoder_tokenizer) + n = len(input_texts) + inputs_latents = self.vae_model.latent_code_from_text_batch(input_texts) + well_trained_gan = False + while not well_trained_gan: + self.gan_training(inputs_latents) + latent = torch.tensor(self.gan.gen_test(n)) + if not latent.isnan().any(): + well_trained_gan = True + + def generate(self,n): + latent_z = torch.tensor(self.gan.gen_test(n)).to(device) + text = self.vae_model.text_from_latent_code_batch(latent_z,prompt=None) + return text + + def gan_training(self,inputs_latents): + for gt in range(self.config.gan_epoch): + x_train,y_train,x_test,y_test,perm = self.gan.ready_cls(inputs_latents) + # sent_output:latent_z inputs_labels:id of class label + self.gan.cls_train(x_train, y_train) + x2_gen, y_gen, s_gen = self.gan.ready_gen(inputs_latents) + # s_gen:sent_output + self.gan.gen_train(x2_gen, y_gen, s_gen, gt) diff --git a/fengshen/models/GAVAE/gans_model.py b/fengshen/models/GAVAE/gans_model.py new file mode 100644 index 0000000000000000000000000000000000000000..5880acf9c36c6dfd41cf6286f25a93501e64e5e5 --- /dev/null +++ b/fengshen/models/GAVAE/gans_model.py @@ -0,0 +1,484 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch.utils.data import Dataset, DataLoader +import numpy as np + + +class MyDataset(Dataset): + def __init__(self, x, y): + self.x = x + self.y = y + self.len = self.x.size(0) + + def __getitem__(self, index): + return self.x[index], self.y[index] + + def __len__(self): + return self.len + + +class MyDataset_new(Dataset): + def __init__(self, x, y, s): + self.x = x + self.y = y + self.s = s + self.len = self.x.size(0) + + def __getitem__(self, index): + return self.x[index], self.y[index], self.s[index] + + def __len__(self): + return self.len + + +class CLS_Net(torch.nn.Module): + + def __init__(self, cls_num, z_dim, cls_batch_size): + super(CLS_Net, self).__init__() + + mini_dim = 256 #256 + + out_input_num = mini_dim + + base_dim = 64 #256 #64 + + self.cls_batch_size = cls_batch_size + self.jie = 1 + + self.fc1 = nn.Linear(z_dim, mini_dim) + self.fc1.weight.data.normal_(0, 0.1) + + self.fc2 = nn.Linear(out_input_num, base_dim) + 
self.fc2.weight.data.normal_(0, 0.1) + + self.out = nn.Linear(base_dim, cls_num) + self.out.weight.data.normal_(0, 0.1) + + def self_dis(self, a): + max_dim = self.cls_batch_size + jie = self.jie + + all_tag = False + for j in range(a.shape[0]): + col_tag = False + for i in range(a.shape[0]): + tmp = F.pairwise_distance(a[j,:], a[i,:] , p = jie).view(-1,1) + if col_tag == False: + col_dis = tmp + col_tag = True + else: + col_dis = torch.cat((col_dis, tmp), dim = 0) + if all_tag == False: + all_dis = col_dis + all_tag = True + else: + all_dis = torch.cat((all_dis, col_dis), dim = 1) + ''' + print(all_dis.shape) + if all_dis.shape[1] < max_dim: + all_dis = torch.cat((all_dis, all_dis[:,:(max_dim - all_dis.shape[1])]), dim = 1) + print(all_dis.shape) + ''' + return all_dis + + def forward(self, x): + + x = self.fc1(x) + x1 = F.relu(x) + + x2 = self.fc2(x1) + x2 = torch.nn.Dropout(0.1)(x2) #0.3 + x2 = F.relu(x2) + + y = self.out(x2) + + return y, x1 + + +class Gen_Net(torch.nn.Module): + + def __init__(self,input_x2_dim, output_dim): + super(Gen_Net, self).__init__() + + self.x2_input = nn.Linear(input_x2_dim , 60) + self.x2_input.weight.data.normal_(0, 0.1) + + self.fc1 = nn.Linear(60, 128) + self.fc1.weight.data.normal_(0, 0.1) + + self.fc2 = nn.Linear(128, 256) + self.fc2.weight.data.normal_(0, 0.1) + + self.fc3 = nn.Linear(256, 128) + self.fc3.weight.data.normal_(0, 0.1) + + self.out = nn.Linear(128, output_dim) + self.out.weight.data.normal_(0, 0.1) + + def forward(self,x2): + x2 = self.x2_input(x2) + + x = x2 + x = self.fc1(x) + x = F.relu(x) + + x = self.fc2(x) + x = F.relu(x) + + x = self.fc3(x) + x = F.relu(x) + y = self.out(x) + + return y + + +class gans_process(): + + def __init__(self, config): + + #base pare + self.device = config.device + self.cls_num = config.cls_num + self.x2_dim = config.noise_dim + self.z_dim = config.z_dim + + self.cls_lr = config.cls_lr + self.gen_lr = config.gen_lr + self.cls_epoches = config.cls_epoches + self.gen_epoches = config.gen_epoches + self.mse_weight = 1.0 + + self.cls_batch_size = config.cls_batch_size + self.gen_batch_size = config.gen_batch_size + self.eval_batch_size = config.cls_batch_size + self.gen_batch_size = self.cls_batch_size + + #optimer and net + self.cls_net = CLS_Net(self.cls_num, self.z_dim, self.cls_batch_size).to(self.device) + self.cls_optimizer = torch.optim.SGD(self.cls_net.parameters(), + lr = self.cls_lr , weight_decay= 1e-5) + # gen net + self.gen_net = Gen_Net(self.x2_dim, self.z_dim).to(self.device) + + self.gen_optimizer = torch.optim.SGD(self.gen_net.parameters(), + lr = self.gen_lr , weight_decay= 0.01) + + #base loss + self.loss_func = torch.nn.CrossEntropyLoss() + self.loss_mse = torch.nn.MSELoss() + + def freeze_cls(self): + for param in self.cls_net.parameters(): + param.requires_grad = False + + def unfreeze_cls(self): + for param in self.cls_net.parameters(): + param.requires_grad = True + + def freeze_gen(self): + for param in self.gen_net.parameters(): + param.requires_grad = False + + def unfreeze_gen(self): + for param in self.gen_net.parameters(): + param.requires_grad = True + + def labels2genx(self, sample_num): + x = torch.rand(sample_num, self.x2_dim) + return x.to(self.device) + + def pad_batch(self, x): + if int(x.shape[0] % self.cls_batch_size) == 0: + return x + pad_len = self.cls_batch_size - ( x.shape[0] % self.cls_batch_size) + x = torch.cat((x, x[:pad_len]), dim = 0) + return x + + def ready_cls(self, sent_output,perm=None): + sample_num = len(sent_output) + #---------------make fake 
z--------------- + sent_output = sent_output.to(self.device) + sent_noise = torch.tensor(self.gen_test(sample_num)).to(self.device) + + #--------------handle datas--------------- + x = torch.cat((sent_output, sent_noise), dim = 0 ) + if perm is None: + perm = torch.randperm(len(x)) + x = x[perm] + #add y - only one label per time + multi_label_num = 1 + multi_output_y = torch.tensor([0]*sample_num).unsqueeze(1) + multi_noise_y = torch.zeros([sent_noise.size(0),1], dtype = torch.int) + multi_noise_y = multi_noise_y + multi_label_num + + y = torch.cat((multi_output_y, multi_noise_y), dim = 0).to(self.device) + y = y[perm] + # x_train = x [:self.train_len] + # y_train = y [:self.train_len] + # x_test = x [self.train_len:] + # y_test = y [self.train_len:] + + return x,y,None,None,perm + + def ready_fake(self, sent_output, inputs_labels, inputs_indexs, label2id, perm = None): + + #---------------make fake z--------------- + sent_output = sent_output.to(self.device) + sent_noise = torch.tensor(self.gen_test(inputs_labels, inputs_indexs)).to(self.device) + + #--------------handle datas--------------- + x = sent_noise + y = torch.tensor(inputs_labels).unsqueeze(1) + if perm is None: + perm = torch.randperm(len(x)) + x = x[perm] + y = y[perm] + + return x,y,perm + + def ready_gen(self, sent_output): + #, inputs_labels, inputs_indexs + sent_num = len(sent_output) + sent_output = sent_output.to(self.device) + x2 = self.labels2genx(sent_num) + y = torch.tensor([0]*sent_num).unsqueeze(1).to(self.device) + + return x2, y, sent_output + + def cls_train(self, x, y, if_oneHot = True): + + #init + self.cls_net.train() + self.gen_net.eval() + + self.unfreeze_cls() + self.freeze_gen() + + x = x.to(self.device) + y = y.to(self.device) + + #if oneHot + if if_oneHot: + y = torch.zeros(y.size(0), self.cls_num).to(self.device).scatter_(1, y.long(), 1) + #make dataset + mydataset = MyDataset(x, y) + train_loader = DataLoader(dataset=mydataset, + batch_size=self.cls_batch_size, shuffle=True) + + #training + for epoch in range(self.cls_epoches): + losses = [] + accuracy = [] + for step, (batch_x, batch_y) in enumerate(train_loader): + self.cls_optimizer.zero_grad() + + out, _ = self.cls_net(batch_x) + loss = self.loss_func(out, batch_y) + + #One-side label smoothing -not used + #location 0 real, location 1 fake + batch_y = batch_y * torch.tensor([0.9, 1.0]).to(self.device) + + loss.backward() + self.cls_optimizer.step() + #tqdm + _, predictions = out.max(1) + predictions = predictions.cpu().numpy().tolist() + _,real_y = batch_y.max(1) + real_y = real_y.cpu().numpy().tolist() + + num_correct = np.sum([int(x==y) for x,y in zip(predictions, real_y)]) + running_train_acc = float(num_correct) / float(batch_x.shape[0]) + losses.append(loss) + accuracy.append(running_train_acc) + + + return self.cls_net + + def cls_eval(self, x, y, if_oneHot = True): + + #init + self.cls_net.eval() + x = x.to(self.device) + y = y.to(self.device) + + #if oneHot + if if_oneHot: + y = torch.zeros(y.size(0), self.cls_num).to(self.device).scatter_(1, y.long(), 1) + #make dataset + mydataset = MyDataset(x, y) + train_loader = DataLoader(dataset=mydataset, + batch_size=self.eval_batch_size, shuffle=False) + + losses = [] + accuracy = [] + #evaling + for step, (batch_x, batch_y) in enumerate(train_loader): + out,_ = self.cls_net(batch_x) + loss = self.loss_func(out, batch_y) + + #tqdm + _, predictions = out.max(1) + predictions = predictions.cpu().numpy().tolist() + _,real_y = batch_y.max(1) + real_y = real_y.cpu().numpy().tolist() + + num_correct 
= np.sum([int(x==y) for x,y in zip(predictions, real_y)]) + running_train_acc = float(num_correct) / float(batch_x.shape[0]) + accuracy.append(running_train_acc) + + + mean_acc = np.mean(accuracy) + return mean_acc + + def cls_real_eval(self, x, y, if_oneHot = True): + + #init + self.cls_net.eval() + x = x.to(self.device) + y = y.to(self.device) + + #if oneHot + if if_oneHot: + y = torch.zeros(y.size(0), self.cls_num).to(self.device).scatter_(1, y.long(), 1) + #make dataset + mydataset = MyDataset(x, y) + train_loader = DataLoader(dataset=mydataset, + batch_size=self.eval_batch_size, shuffle=False) + + rs = 0 + alls = 0 + + #evaling + for step, (batch_x, batch_y) in enumerate(train_loader): + out, _ = self.cls_net(batch_x) + loss = self.loss_func(out, batch_y) + + #tqdm + _, predictions = out.max(1) + predictions = predictions.cpu().numpy().tolist() + _,real_y = batch_y.max(1) + real_y = real_y.cpu().numpy().tolist() + + right_num = np.sum([int( x==y and int(y) != int(self.cls_num-1) ) for x,y in zip(predictions, real_y)]) + all_num = np.sum([int(int(y) != int(self.cls_num-1) ) for x,y in zip(predictions, real_y)]) + + rs = rs + right_num + alls = alls + all_num + + + return rs/alls + + def cls_test(self, x, if_oneHot = True): + + #init + self.cls_net.eval() + x = x.to(self.device) + y = torch.zeros([x.size(0),1], dtype = torch.float).to(self.device) + + #if oneHot + if if_oneHot: + y = torch.zeros(y.size(0), self.cls_num).to(self.device).scatter_(1, y.long(), 1) + #make dataset + mydataset = MyDataset(x, y) + train_loader = DataLoader(dataset=mydataset, + batch_size=self.eval_batch_size, shuffle=False) + + preds = [] + #testing + for step, (batch_x, batch_y) in enumerate(train_loader): + out, _ = self.cls_net(batch_x) + loss = self.loss_func(out, batch_y) + + #tqdm + _, predictions = out.max(1) + predictions = predictions.cpu().numpy().tolist() + preds.extend(predictions) + + return preds + + def gen_train(self, x2, y, s, times): + + #init + self.cls_net.eval() + self.gen_net.train() + + self.freeze_cls() + self.unfreeze_gen() + + #y is gen + cls + y = torch.zeros(y.size(0), self.cls_num).to(self.device).scatter_(1, y.long(), 1) + + #make dataset + mydataset = MyDataset_new(x2, y, s) + train_loader = DataLoader(dataset=mydataset, + batch_size=self.gen_batch_size, shuffle=True) + + #training + for epoch in range(self.gen_epoches): + losses = [] + accuracy = [] + for step, (batch_x2, batch_y, batch_s) in enumerate(train_loader): + + # no zero_grad = make batch_size + if step % 6 == 5: #23 + self.gen_optimizer.zero_grad() + + out = self.gen_net(batch_x2) + + #fearture matching + out, hds = self.cls_net(out) + out2, hds2 = self.cls_net(batch_s.float()) + loss = self.loss_mse(hds, hds2) + loss = loss * pow(0.9, times) + loss.backward() + self.gen_optimizer.step() + + #tqdm + _, predictions = out.max(1) + predictions = predictions.cpu().numpy().tolist() + _, real_y = batch_y.max(1) + real_y = real_y.cpu().numpy().tolist() + + num_correct = np.sum([int(x==y) for x,y in zip(predictions, real_y)]) + running_train_acc = float(num_correct) / float(batch_x2.shape[0]) + losses.append(loss) + accuracy.append(running_train_acc) + + return self.gen_net + + def gen_test(self, sample_num): + + #init + self.gen_net.eval() + x2 = self.labels2genx(sample_num) + #x2: len(inputs_labels) * 80 + y = torch.zeros([sample_num,1], dtype = torch.float) + y = torch.zeros(sample_num, self.z_dim).scatter_(1, y.long(), 1) + y = y.to(self.device) + s = torch.ones((sample_num, self.z_dim)).to(self.device) + + #make dataset 
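+ # y (a one-hot placeholder) and s (all ones) only let MyDataset_new be reused for
+ # batching; the MSE loss against batch_s below is computed but never used, and the
+ # latents collected in preds come solely from the random noise x2 passed through the
+ # trained generator.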
+ mydataset = MyDataset_new(x2, y, s) + train_loader = DataLoader(dataset=mydataset, + batch_size=self.eval_batch_size, shuffle=False) + + preds = [] + #testing + for step, (batch_x2, batch_y, batch_s) in enumerate(train_loader): + + out = self.gen_net(batch_x2) + + loss = self.loss_mse(out.double(), batch_s.double()) + + predictions = out.cpu().detach().numpy().tolist() + preds.extend(predictions) + + return preds + + +if __name__ == '__main__': + + pass + diff --git a/fengshen/models/PPVAE/__init__.py b/fengshen/models/PPVAE/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..a92b6a8083d4f23a890ebe0c8635a94d0328fcea --- /dev/null +++ b/fengshen/models/PPVAE/__init__.py @@ -0,0 +1,15 @@ +# coding=utf-8 +# Copyright 2022 IDEA-CCNL The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" PyTorch PPVAE model. """ diff --git a/fengshen/models/PPVAE/pluginVAE.py b/fengshen/models/PPVAE/pluginVAE.py new file mode 100644 index 0000000000000000000000000000000000000000..8841d64ca9d2cc63764015053a021103dfee24dd --- /dev/null +++ b/fengshen/models/PPVAE/pluginVAE.py @@ -0,0 +1,180 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch.utils.data import DataLoader +from transformers.modeling_utils import PreTrainedModel +from transformers.configuration_utils import PretrainedConfig + +from fengshen.models.DAVAE.DAVAEModel import DAVAEModel +from fengshen.models.PPVAE.utils import * + +device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + +class Encoder(nn.Module): + def __init__(self, latent_dim=128, bottle_dim=20) -> None: + super().__init__() + self.fc1 = nn.Linear(latent_dim, latent_dim//2) + self.fc2 = nn.Linear(latent_dim//2, latent_dim//4) + self.mean = nn.Linear(latent_dim//4, bottle_dim) + self.log_var = nn.Linear(latent_dim//4, bottle_dim) + + def kl_loss(self, mean, log_var): + return (-0.5 * (1 + log_var - mean**2 - log_var.exp()).sum(-1)).mean() + + def sampling(self, mean, log_var): + epsilon = torch.randn(mean.shape[0], mean.shape[-1], device=mean.device) + return mean + (log_var / 2).exp() * epsilon.unsqueeze(1) + + def forward(self, z): + ''' + :param z: shape (b, latent_dim) + ''' + z = self.fc1(z) + z = F.leaky_relu(z) + z = F.leaky_relu(self.fc2(z)) + z_mean = self.mean(z) + + z_log_var = self.log_var(z) + kl_loss = self.kl_loss(z_mean, z_log_var) + enc_z = self.sampling(z_mean, z_log_var) + + if not self.training: + enc_z = z_mean + + return enc_z, kl_loss + +class Decoder(nn.Module): + def __init__(self, latent_dim=128, bottle_dim=20) -> None: + super().__init__() + self.fc1 = nn.Linear(bottle_dim, latent_dim//4) + self.fc2 = nn.Linear(latent_dim//4, latent_dim//2) + self.fc3 = nn.Linear(latent_dim//2, latent_dim) + + def forward(self, enc_z): + z = F.leaky_relu(self.fc1(enc_z)) + z = F.leaky_relu(self.fc2(z)) + z = self.fc3(z) + return z + +class PluginVAE(nn.Module): + def __init__(self, config) -> None: + super().__init__() + self.kl_weight = config.kl_weight + 
self.beta = config.beta + self.encoder = Encoder(config.latent_dim, config.bottle_dim) + self.decoder = Decoder(config.latent_dim, config.bottle_dim) + + def set_beta(self, beta): + self.beta = beta + + def forward(self, z): + enc_z, kl_loss = self.encoder(z) + z_out = self.decoder(enc_z) + return z_out, kl_loss + + def loss(self, z): + z_out, kl_loss = self.forward(z) + z_loss = ((z_out-z)**2).mean() + loss = z_loss + self.kl_weight * (kl_loss-self.beta).abs() + return loss, kl_loss + +class PPVAEPretrainedModel(PreTrainedModel): + def _init_weights(self, module): + """ Initialize the weights """ + pass # to bypass the not implement error + +class PPVAEModel(PPVAEPretrainedModel): + config_class = PretrainedConfig + def __init__(self, config:PretrainedConfig) -> None: + super().__init__(config=config) + self.config =config + self.pluginvae = PluginVAE(self.config) + self.vae_model = DAVAEModel(self.config) + + def train_plugin(self,encoder_tokenizer,decoder_tokenizer,input_texts,negative_samples=None): + # 输入:pluginVAE,label,train_data_dict + # 输出:pluginVAE + self.vae_model.set_tokenizers(encoder_tokenizer,decoder_tokenizer) + pos=self.get_latent(input_texts) + pos_batch_size = self.config.batch_size + total_epoch = self.config.total_epoch + pos_dataset = CustomDataset(pos) + pos_dataloader = DataLoader( + pos_dataset, + batch_size=pos_batch_size, + shuffle=True + ) + neg =None + if negative_samples is not None: + neg=self.get_latent(negative_samples) + neg_batch_size = int(pos_batch_size*(neg.shape[0]/pos.shape[0])) + neg_dataset = CustomDataset(neg) + neg_dataloader = DataLoader( + neg_dataset, + batch_size=neg_batch_size, + shuffle=True + ) + optimizer = torch.optim.Adam( + params=self.pluginvae.parameters(), + lr=self.config.ppvae_lr, betas=(self.config.mu, self.config.nu) + ) + gamma = self.config.gamma + iter_num = 0 + early_stopper = EarlyStopping() + min_loss = 10.0 + for epoch in range(total_epoch): + self.pluginvae.train() + total_pos_loss = 0.0 + total_neg_loss = 0.0 + total_loss = 0.0 + total_pos_kl = 0.0 + for i, data in enumerate(pos_dataloader): + if self.config.get_dymanic_beta: + self.pluginvae.set_beta(self.get_beta_weight(iter_num,self.config.beta,self.config.beta_total_step)) + iter_num += 1 + pos_loss,pos_kl = self.pluginvae.loss(data) + neg_loss = 0.0 + if neg is not None: + neg_data = next(iter(neg_dataloader)) + neg_loss,loss_kl = self.pluginvae.loss(neg_data) + if neg_loss.item()>self.config.neg_loss_threshold*pos_loss.item(): + # print("neg_loss exceed, detached") + neg_loss = neg_loss.detach() + total_neg_loss += neg_loss.item() + loss = pos_loss - gamma*neg_loss + optimizer.zero_grad() + loss.backward() + optimizer.step() + + total_pos_loss += pos_loss.item() + total_loss += loss.item() + total_pos_kl += pos_kl.item() + avg_loss = total_loss/len(pos_dataloader) + avg_kl_loss = total_pos_kl/len(pos_dataloader) + if avg_loss None: + super().__init__() + self.data = data + + def __len__(self): + return len(self.data) + + def __getitem__(self, index): + # Get data + d = self.data[index] + return d + +class EarlyStopping(): + def __init__(self, tolerance=10, min_delta=0): + + self.tolerance = tolerance + self.min_delta = min_delta + self.counter = 0 + self.early_stop = False + + def __call__(self, train_loss, min_loss): + if (train_loss-min_loss) > self.min_delta: + self.counter +=1 + if self.counter >= self.tolerance: + self.early_stop = True + +# def gen_text_from_center(args,plugin_vae, vae_model, decoder_tokenizer,label,epoch,pos): +# gen_text = [] +# latent_z = 
gen_latent_center(plugin_vae,pos).to(args.device).repeat((1,1)) +# print("latent_z",latent_z.shape) +# text_analogy = text_from_latent_code_batch(latent_z, vae_model, args, decoder_tokenizer) +# print("label",label) +# print(text_analogy) +# gen_text.extend([(label,y,epoch) for y in text_analogy]) +# text2out(gen_text, '/cognitive_comp/liangyuxin/projects/cond_vae/outputs/test.json') \ No newline at end of file diff --git a/fengshen/models/__init__.py b/fengshen/models/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..9bad5790a5799b96f2e164d825c0b1f8ec0c2dfb --- /dev/null +++ b/fengshen/models/__init__.py @@ -0,0 +1 @@ +# coding=utf-8 diff --git a/fengshen/models/albert/modeling_albert.py b/fengshen/models/albert/modeling_albert.py new file mode 100644 index 0000000000000000000000000000000000000000..7c5298825fb471e0575dabaefb2b8514e5bedcd8 --- /dev/null +++ b/fengshen/models/albert/modeling_albert.py @@ -0,0 +1,1363 @@ +# coding=utf-8 +# Copyright 2018 Google AI, Google Brain and the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""PyTorch ALBERT model. """ + +import math +import os +from dataclasses import dataclass +from typing import Optional, Tuple + +import torch +from packaging import version +from torch import nn +from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss + +from transformers.activations import ACT2FN +from transformers.file_utils import ( + ModelOutput, + add_code_sample_docstrings, + add_start_docstrings, + add_start_docstrings_to_model_forward, + replace_return_docstrings, +) +from transformers.modeling_outputs import ( + BaseModelOutput, + BaseModelOutputWithPooling, + MaskedLMOutput, + MultipleChoiceModelOutput, + QuestionAnsweringModelOutput, + SequenceClassifierOutput, + TokenClassifierOutput, +) +from transformers.modeling_utils import ( + PreTrainedModel, + apply_chunking_to_forward, + find_pruneable_heads_and_indices, + prune_linear_layer, +) +from transformers.utils import logging +from transformers import AlbertConfig + + + +logger = logging.get_logger(__name__) + +_CHECKPOINT_FOR_DOC = "albert-base-v2" +_CONFIG_FOR_DOC = "AlbertConfig" +_TOKENIZER_FOR_DOC = "AlbertTokenizer" + + +ALBERT_PRETRAINED_MODEL_ARCHIVE_LIST = [ + "albert-base-v1", + "albert-large-v1", + "albert-xlarge-v1", + "albert-xxlarge-v1", + "albert-base-v2", + "albert-large-v2", + "albert-xlarge-v2", + "albert-xxlarge-v2", + # See all ALBERT models at https://huggingface.co./models?filter=albert +] + + +def load_tf_weights_in_albert(model, config, tf_checkpoint_path): + """Load tf checkpoints in a pytorch model.""" + try: + import re + + import numpy as np + import tensorflow as tf + except ImportError: + logger.error( + "Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see " + "https://www.tensorflow.org/install/ for installation instructions." 
+ ) + raise + tf_path = os.path.abspath(tf_checkpoint_path) + logger.info(f"Converting TensorFlow checkpoint from {tf_path}") + # Load weights from TF model + init_vars = tf.train.list_variables(tf_path) + names = [] + arrays = [] + for name, shape in init_vars: + logger.info(f"Loading TF weight {name} with shape {shape}") + array = tf.train.load_variable(tf_path, name) + names.append(name) + arrays.append(array) + + for name, array in zip(names, arrays): + print(name) + + for name, array in zip(names, arrays): + original_name = name + + # If saved from the TF HUB module + name = name.replace("module/", "") + + # Renaming and simplifying + name = name.replace("ffn_1", "ffn") + name = name.replace("bert/", "albert/") + name = name.replace("attention_1", "attention") + name = name.replace("transform/", "") + name = name.replace("LayerNorm_1", "full_layer_layer_norm") + name = name.replace("LayerNorm", "attention/LayerNorm") + name = name.replace("transformer/", "") + + # The feed forward layer had an 'intermediate' step which has been abstracted away + name = name.replace("intermediate/dense/", "") + name = name.replace("ffn/intermediate/output/dense/", "ffn_output/") + + # ALBERT attention was split between self and output which have been abstracted away + name = name.replace("/output/", "/") + name = name.replace("/self/", "/") + + # The pooler is a linear layer + name = name.replace("pooler/dense", "pooler") + + # The classifier was simplified to predictions from cls/predictions + name = name.replace("cls/predictions", "predictions") + name = name.replace("predictions/attention", "predictions") + + # Naming was changed to be more explicit + name = name.replace("embeddings/attention", "embeddings") + name = name.replace("inner_group_", "albert_layers/") + name = name.replace("group_", "albert_layer_groups/") + + # Classifier + if len(name.split("/")) == 1 and ("output_bias" in name or "output_weights" in name): + name = "classifier/" + name + + # No ALBERT model currently handles the next sentence prediction task + if "seq_relationship" in name: + name = name.replace("seq_relationship/output_", "sop_classifier/classifier/") + name = name.replace("weights", "weight") + + name = name.split("/") + + # Ignore the gradients applied by the LAMB/ADAM optimizers. 
+ if ( + "adam_m" in name + or "adam_v" in name + or "AdamWeightDecayOptimizer" in name + or "AdamWeightDecayOptimizer_1" in name + or "global_step" in name + ): + logger.info(f"Skipping {'/'.join(name)}") + continue + + pointer = model + for m_name in name: + if re.fullmatch(r"[A-Za-z]+_\d+", m_name): + scope_names = re.split(r"_(\d+)", m_name) + else: + scope_names = [m_name] + + if scope_names[0] == "kernel" or scope_names[0] == "gamma": + pointer = getattr(pointer, "weight") + elif scope_names[0] == "output_bias" or scope_names[0] == "beta": + pointer = getattr(pointer, "bias") + elif scope_names[0] == "output_weights": + pointer = getattr(pointer, "weight") + elif scope_names[0] == "squad": + pointer = getattr(pointer, "classifier") + else: + try: + pointer = getattr(pointer, scope_names[0]) + except AttributeError: + logger.info(f"Skipping {'/'.join(name)}") + continue + if len(scope_names) >= 2: + num = int(scope_names[1]) + pointer = pointer[num] + + if m_name[-11:] == "_embeddings": + pointer = getattr(pointer, "weight") + elif m_name == "kernel": + array = np.transpose(array) + try: + if pointer.shape != array.shape: + raise ValueError(f"Pointer shape {pointer.shape} and array shape {array.shape} mismatched") + except AssertionError as e: + e.args += (pointer.shape, array.shape) + raise + print(f"Initialize PyTorch weight {name} from {original_name}") + pointer.data = torch.from_numpy(array) + + return model + + +class AlbertEmbeddings(nn.Module): + """ + Construct the embeddings from word, position and token_type embeddings. + """ + + def __init__(self, config): + super().__init__() + self.word_embeddings = nn.Embedding(config.vocab_size, config.embedding_size, padding_idx=config.pad_token_id) + self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.embedding_size) + self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.embedding_size) + + # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load + # any TensorFlow checkpoint file + self.LayerNorm = nn.LayerNorm(config.embedding_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + # position_ids (1, len position emb) is contiguous in memory and exported when serialized + self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1))) + self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") + if version.parse(torch.__version__) > version.parse("1.6.0"): + self.register_buffer( + "token_type_ids", + torch.zeros(self.position_ids.size(), dtype=torch.long, device=self.position_ids.device), + persistent=False, + ) + + # Copied from transformers.models.bert.modeling_bert.BertEmbeddings.forward + def forward( + self, input_ids=None, token_type_ids=None, position_ids=None, inputs_embeds=None, past_key_values_length=0 + ): + if input_ids is not None: + input_shape = input_ids.size() + else: + input_shape = inputs_embeds.size()[:-1] + + seq_length = input_shape[1] + + if position_ids is None: + position_ids = self.position_ids[:, past_key_values_length : seq_length + past_key_values_length] + + # Setting the token_type_ids to the registered buffer in constructor where it is all zeros, which usually occurs + # when its auto-generated, registered buffer helps users when tracing the model without passing token_type_ids, solves + # issue #5664 + if token_type_ids is None: + if hasattr(self, "token_type_ids"): + buffered_token_type_ids = 
self.token_type_ids[:, :seq_length] + buffered_token_type_ids_expanded = buffered_token_type_ids.expand(input_shape[0], seq_length) + token_type_ids = buffered_token_type_ids_expanded + else: + token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=self.position_ids.device) + + if inputs_embeds is None: + inputs_embeds = self.word_embeddings(input_ids) + token_type_embeddings = self.token_type_embeddings(token_type_ids) + + embeddings = inputs_embeds + token_type_embeddings + if self.position_embedding_type == "absolute": + position_embeddings = self.position_embeddings(position_ids) + embeddings += position_embeddings + embeddings = self.LayerNorm(embeddings) + embeddings = self.dropout(embeddings) + return embeddings + + +class AlbertAttention(nn.Module): + def __init__(self, config): + super().__init__() + if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"): + raise ValueError( + f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention " + f"heads ({config.num_attention_heads}" + ) + + self.num_attention_heads = config.num_attention_heads + self.hidden_size = config.hidden_size + self.attention_head_size = config.hidden_size // config.num_attention_heads + self.all_head_size = self.num_attention_heads * self.attention_head_size + + self.query = nn.Linear(config.hidden_size, self.all_head_size) + self.key = nn.Linear(config.hidden_size, self.all_head_size) + self.value = nn.Linear(config.hidden_size, self.all_head_size) + + self.attention_dropout = nn.Dropout(config.attention_probs_dropout_prob) + self.output_dropout = nn.Dropout(config.hidden_dropout_prob) + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.pruned_heads = set() + + self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") + if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": + self.max_position_embeddings = config.max_position_embeddings + self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size) + + # Copied from transformers.models.bert.modeling_bert.BertSelfAttention.transpose_for_scores + def transpose_for_scores(self, x): + new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size) + x = x.view(*new_x_shape) + return x.permute(0, 2, 1, 3) + + def prune_heads(self, heads): + if len(heads) == 0: + return + heads, index = find_pruneable_heads_and_indices( + heads, self.num_attention_heads, self.attention_head_size, self.pruned_heads + ) + + # Prune linear layers + self.query = prune_linear_layer(self.query, index) + self.key = prune_linear_layer(self.key, index) + self.value = prune_linear_layer(self.value, index) + self.dense = prune_linear_layer(self.dense, index, dim=1) + + # Update hyper params and store pruned heads + self.num_attention_heads = self.num_attention_heads - len(heads) + self.all_head_size = self.attention_head_size * self.num_attention_heads + self.pruned_heads = self.pruned_heads.union(heads) + + def forward(self, hidden_states, attention_mask=None, head_mask=None, output_attentions=False): + mixed_query_layer = self.query(hidden_states) + mixed_key_layer = self.key(hidden_states) + mixed_value_layer = self.value(hidden_states) + + query_layer = self.transpose_for_scores(mixed_query_layer) + key_layer = self.transpose_for_scores(mixed_key_layer) + value_layer = 
self.transpose_for_scores(mixed_value_layer) + + # Take the dot product between "query" and "key" to get the raw attention scores. + attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2)) + attention_scores = attention_scores / math.sqrt(self.attention_head_size) + + if attention_mask is not None: + # Apply the attention mask is (precomputed for all layers in BertModel forward() function) + attention_scores = attention_scores + attention_mask + + if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": + seq_length = hidden_states.size()[1] + position_ids_l = torch.arange(seq_length, dtype=torch.long, device=hidden_states.device).view(-1, 1) + position_ids_r = torch.arange(seq_length, dtype=torch.long, device=hidden_states.device).view(1, -1) + distance = position_ids_l - position_ids_r + positional_embedding = self.distance_embedding(distance + self.max_position_embeddings - 1) + positional_embedding = positional_embedding.to(dtype=query_layer.dtype) # fp16 compatibility + + if self.position_embedding_type == "relative_key": + relative_position_scores = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding) + attention_scores = attention_scores + relative_position_scores + elif self.position_embedding_type == "relative_key_query": + relative_position_scores_query = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding) + relative_position_scores_key = torch.einsum("bhrd,lrd->bhlr", key_layer, positional_embedding) + attention_scores = attention_scores + relative_position_scores_query + relative_position_scores_key + + # Normalize the attention scores to probabilities. + attention_probs = nn.Softmax(dim=-1)(attention_scores) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. 
+ attention_probs = self.attention_dropout(attention_probs) + + # Mask heads if we want to + if head_mask is not None: + attention_probs = attention_probs * head_mask + + context_layer = torch.matmul(attention_probs, value_layer) + context_layer = context_layer.transpose(2, 1).flatten(2) + + projected_context_layer = self.dense(context_layer) + projected_context_layer_dropout = self.output_dropout(projected_context_layer) + layernormed_context_layer = self.LayerNorm(hidden_states + projected_context_layer_dropout) + return (layernormed_context_layer, attention_probs) if output_attentions else (layernormed_context_layer,) + + +class AlbertLayer(nn.Module): + def __init__(self, config): + super().__init__() + + self.config = config + self.chunk_size_feed_forward = config.chunk_size_feed_forward + self.seq_len_dim = 1 + self.full_layer_layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.attention = AlbertAttention(config) + self.ffn = nn.Linear(config.hidden_size, config.intermediate_size) + self.ffn_output = nn.Linear(config.intermediate_size, config.hidden_size) + self.activation = ACT2FN[config.hidden_act] + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward( + self, hidden_states, attention_mask=None, head_mask=None, output_attentions=False, output_hidden_states=False + ): + attention_output = self.attention(hidden_states, attention_mask, head_mask, output_attentions) + + ffn_output = apply_chunking_to_forward( + self.ff_chunk, + self.chunk_size_feed_forward, + self.seq_len_dim, + attention_output[0], + ) + hidden_states = self.full_layer_layer_norm(ffn_output + attention_output[0]) + + return (hidden_states,) + attention_output[1:] # add attentions if we output them + + def ff_chunk(self, attention_output): + ffn_output = self.ffn(attention_output) + ffn_output = self.activation(ffn_output) + ffn_output = self.ffn_output(ffn_output) + return ffn_output + + +class AlbertLayerGroup(nn.Module): + def __init__(self, config): + super().__init__() + + self.albert_layers = nn.ModuleList([AlbertLayer(config) for _ in range(config.inner_group_num)]) + + def forward( + self, hidden_states, attention_mask=None, head_mask=None, output_attentions=False, output_hidden_states=False + ): + layer_hidden_states = () + layer_attentions = () + + for layer_index, albert_layer in enumerate(self.albert_layers): + layer_output = albert_layer(hidden_states, attention_mask, head_mask[layer_index], output_attentions) + hidden_states = layer_output[0] + + if output_attentions: + layer_attentions = layer_attentions + (layer_output[1],) + + if output_hidden_states: + layer_hidden_states = layer_hidden_states + (hidden_states,) + + outputs = (hidden_states,) + if output_hidden_states: + outputs = outputs + (layer_hidden_states,) + if output_attentions: + outputs = outputs + (layer_attentions,) + return outputs # last-layer hidden state, (layer hidden states), (layer attentions) + + +class AlbertTransformer(nn.Module): + def __init__(self, config): + super().__init__() + + self.config = config + self.embedding_hidden_mapping_in = nn.Linear(config.embedding_size, config.hidden_size) + self.albert_layer_groups = nn.ModuleList([AlbertLayerGroup(config) for _ in range(config.num_hidden_groups)]) + + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + output_attentions=False, + output_hidden_states=False, + return_dict=True, + ): + hidden_states = self.embedding_hidden_mapping_in(hidden_states) + + all_hidden_states = (hidden_states,) if 
output_hidden_states else None + all_attentions = () if output_attentions else None + + head_mask = [None] * self.config.num_hidden_layers if head_mask is None else head_mask + + for i in range(self.config.num_hidden_layers): + # Number of layers in a hidden group + layers_per_group = int(self.config.num_hidden_layers / self.config.num_hidden_groups) + + # Index of the hidden group + group_idx = int(i / (self.config.num_hidden_layers / self.config.num_hidden_groups)) + + layer_group_output = self.albert_layer_groups[group_idx]( + hidden_states, + attention_mask, + head_mask[group_idx * layers_per_group : (group_idx + 1) * layers_per_group], + output_attentions, + output_hidden_states, + ) + hidden_states = layer_group_output[0] + + if output_attentions: + all_attentions = all_attentions + layer_group_output[-1] + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if not return_dict: + return tuple(v for v in [hidden_states, all_hidden_states, all_attentions] if v is not None) + return BaseModelOutput( + last_hidden_state=hidden_states, hidden_states=all_hidden_states, attentions=all_attentions + ) + + +class AlbertPreTrainedModel(PreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. + """ + + config_class = AlbertConfig + load_tf_weights = load_tf_weights_in_albert + base_model_prefix = "albert" + _keys_to_ignore_on_load_missing = [r"position_ids"] + + def _init_weights(self, module): + """Initialize the weights.""" + if isinstance(module, nn.Linear): + # Slightly different from the TF version which uses truncated_normal for initialization + # cf https://github.com/pytorch/pytorch/pull/5617 + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + + +@dataclass +class AlbertForPreTrainingOutput(ModelOutput): + """ + Output type of :class:`~transformers.AlbertForPreTraining`. + + Args: + loss (`optional`, returned when ``labels`` is provided, ``torch.FloatTensor`` of shape :obj:`(1,)`): + Total loss as the sum of the masked language modeling loss and the next sequence prediction + (classification) loss. + prediction_logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`): + Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). + sop_logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, 2)`): + Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation + before SoftMax). + hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): + Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) + of shape :obj:`(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. 
+ attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): + Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, + sequence_length, sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + loss: Optional[torch.FloatTensor] = None + prediction_logits: torch.FloatTensor = None + sop_logits: torch.FloatTensor = None + hidden_states: Optional[Tuple[torch.FloatTensor]] = None + attentions: Optional[Tuple[torch.FloatTensor]] = None + + +ALBERT_START_DOCSTRING = r""" + + This model inherits from :class:`~transformers.PreTrainedModel`. Check the superclass documentation for the generic + methods the library implements for all its model (such as downloading or saving, resizing the input embeddings, + pruning heads etc.) + + This model is also a PyTorch `torch.nn.Module `__ + subclass. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to + general usage and behavior. + + Args: + config (:class:`~transformers.AlbertConfig`): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model + weights. +""" + +ALBERT_INPUTS_DOCSTRING = r""" + Args: + input_ids (:obj:`torch.LongTensor` of shape :obj:`({0})`): + Indices of input sequence tokens in the vocabulary. + + Indices can be obtained using :class:`~transformers.AlbertTokenizer`. See + :meth:`transformers.PreTrainedTokenizer.__call__` and :meth:`transformers.PreTrainedTokenizer.encode` for + details. + + `What are input IDs? <../glossary.html#input-ids>`__ + attention_mask (:obj:`torch.FloatTensor` of shape :obj:`({0})`, `optional`): + Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + `What are attention masks? <../glossary.html#attention-mask>`__ + token_type_ids (:obj:`torch.LongTensor` of shape :obj:`({0})`, `optional`): + Segment token indices to indicate first and second portions of the inputs. Indices are selected in ``[0, + 1]``: + + - 0 corresponds to a `sentence A` token, + - 1 corresponds to a `sentence B` token. + + `What are token type IDs? <../glossary.html#token-type-ids>`_ + position_ids (:obj:`torch.LongTensor` of shape :obj:`({0})`, `optional`): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0, + config.max_position_embeddings - 1]``. + + `What are position IDs? <../glossary.html#position-ids>`_ + head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`): + Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`({0}, hidden_size)`, `optional`): + Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation. + This is useful if you want more control over how to convert :obj:`input_ids` indices into associated + vectors than the model's internal embedding lookup matrix. 
+ output_attentions (:obj:`bool`, `optional`): + Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned + tensors for more detail. + output_hidden_states (:obj:`bool`, `optional`): + Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for + more detail. + return_dict (:obj:`bool`, `optional`): + Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple. +""" + + +@add_start_docstrings( + "The bare ALBERT Model transformer outputting raw hidden-states without any specific head on top.", + ALBERT_START_DOCSTRING, +) +class AlbertModel(AlbertPreTrainedModel): + + config_class = AlbertConfig + base_model_prefix = "albert" + + def __init__(self, config, add_pooling_layer=True): + super().__init__(config) + + self.config = config + self.embeddings = AlbertEmbeddings(config) + self.encoder = AlbertTransformer(config) + if add_pooling_layer: + self.pooler = nn.Linear(config.hidden_size, config.hidden_size) + self.pooler_activation = nn.Tanh() + else: + self.pooler = None + self.pooler_activation = None + + self.init_weights() + + def get_input_embeddings(self): + return self.embeddings.word_embeddings + + def set_input_embeddings(self, value): + self.embeddings.word_embeddings = value + + def _prune_heads(self, heads_to_prune): + """ + Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} ALBERT has + a different architecture in that its layers are shared across groups, which then has inner groups. If an ALBERT + model has 12 hidden layers and 2 hidden groups, with two inner groups, there is a total of 4 different layers. + + These layers are flattened: the indices [0,1] correspond to the two inner groups of the first hidden layer, + while [2,3] correspond to the two inner groups of the second hidden layer. + + Any layer with in index other than [0,1,2,3] will result in an error. 
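As a concrete check on the flattened indexing described above, the following sketch (illustrative only, not part of the patch) mirrors the arithmetic `_prune_heads` uses below to turn a flattened layer index into a hidden-group / inner-group pair, assuming the docstring's example of two hidden groups with two inner groups each:

```python
# Sketch: resolve a flattened layer index into (group_idx, inner_group_idx),
# using the same arithmetic as _prune_heads with inner_group_num = 2.
inner_group_num = 2

def resolve(layer: int):
    group_idx = layer // inner_group_num
    inner_group_idx = layer - group_idx * inner_group_num
    return group_idx, inner_group_idx

# The four distinct layers of the 12-layer / 2-group example map as:
assert [resolve(i) for i in range(4)] == [(0, 0), (0, 1), (1, 0), (1, 1)]
```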
See base class PreTrainedModel for more + information about head pruning + """ + for layer, heads in heads_to_prune.items(): + group_idx = int(layer / self.config.inner_group_num) + inner_group_idx = int(layer - group_idx * self.config.inner_group_num) + self.encoder.albert_layer_groups[group_idx].albert_layers[inner_group_idx].attention.prune_heads(heads) + + @add_start_docstrings_to_model_forward(ALBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_code_sample_docstrings( + processor_class=_TOKENIZER_FOR_DOC, + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=BaseModelOutputWithPooling, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") + elif input_ids is not None: + input_shape = input_ids.size() + elif inputs_embeds is not None: + input_shape = inputs_embeds.size()[:-1] + else: + raise ValueError("You have to specify either input_ids or inputs_embeds") + + batch_size, seq_length = input_shape + device = input_ids.device if input_ids is not None else inputs_embeds.device + + if attention_mask is None: + attention_mask = torch.ones(input_shape, device=device) + if token_type_ids is None: + if hasattr(self.embeddings, "token_type_ids"): + buffered_token_type_ids = self.embeddings.token_type_ids[:, :seq_length] + buffered_token_type_ids_expanded = buffered_token_type_ids.expand(batch_size, seq_length) + token_type_ids = buffered_token_type_ids_expanded + else: + token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device) + + # extended_attention_mask = attention_mask.unsqueeze(1).unsqueeze(2) # + extended_attention_mask = attention_mask[:, None, :, :] + extended_attention_mask = extended_attention_mask.to(dtype=self.dtype) # fp16 compatibility + extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0 + head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers) + + embedding_output = self.embeddings( + input_ids, position_ids=position_ids, token_type_ids=token_type_ids, inputs_embeds=inputs_embeds + ) + encoder_outputs = self.encoder( + embedding_output, + extended_attention_mask, + head_mask=head_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = encoder_outputs[0] + + pooled_output = self.pooler_activation(self.pooler(sequence_output[:, 0])) if self.pooler is not None else None + + if not return_dict: + return (sequence_output, pooled_output) + encoder_outputs[1:] + + return BaseModelOutputWithPooling( + last_hidden_state=sequence_output, + pooler_output=pooled_output, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + ) + + +@add_start_docstrings( + """ + Albert Model with two heads on top as done during the pretraining: a `masked language modeling` head and a + `sentence order prediction (classification)` head. 
+ """, + ALBERT_START_DOCSTRING, +) +class AlbertForPreTraining(AlbertPreTrainedModel): + def __init__(self, config): + super().__init__(config) + + self.albert = AlbertModel(config) + self.predictions = AlbertMLMHead(config) + self.sop_classifier = AlbertSOPHead(config) + + self.init_weights() + + def get_output_embeddings(self): + return self.predictions.decoder + + def set_output_embeddings(self, new_embeddings): + self.predictions.decoder = new_embeddings + + def get_input_embeddings(self): + return self.albert.embeddings.word_embeddings + + @add_start_docstrings_to_model_forward(ALBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @replace_return_docstrings(output_type=AlbertForPreTrainingOutput, config_class=_CONFIG_FOR_DOC) + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + labels=None, + sentence_order_label=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + labels (``torch.LongTensor`` of shape ``(batch_size, sequence_length)``, `optional`): + Labels for computing the masked language modeling loss. Indices should be in ``[-100, 0, ..., + config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are ignored + (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]`` + sentence_order_label (``torch.LongTensor`` of shape ``(batch_size,)``, `optional`): + Labels for computing the next sequence prediction (classification) loss. Input should be a sequence pair + (see :obj:`input_ids` docstring) Indices should be in ``[0, 1]``. ``0`` indicates original order (sequence + A, then sequence B), ``1`` indicates switched order (sequence B, then sequence A). 
+ + Returns: + + Example:: + + >>> from transformers import AlbertTokenizer, AlbertForPreTraining + >>> import torch + + >>> tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2') + >>> model = AlbertForPreTraining.from_pretrained('albert-base-v2') + + >>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1 + >>> outputs = model(input_ids) + + >>> prediction_logits = outputs.prediction_logits + >>> sop_logits = outputs.sop_logits + + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.albert( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output, pooled_output = outputs[:2] + + prediction_scores = self.predictions(sequence_output) + sop_scores = self.sop_classifier(pooled_output) + + total_loss = None + if labels is not None and sentence_order_label is not None: + loss_fct = CrossEntropyLoss() + masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1)) + sentence_order_loss = loss_fct(sop_scores.view(-1, 2), sentence_order_label.view(-1)) + total_loss = masked_lm_loss + sentence_order_loss + + if not return_dict: + output = (prediction_scores, sop_scores) + outputs[2:] + return ((total_loss,) + output) if total_loss is not None else output + + return AlbertForPreTrainingOutput( + loss=total_loss, + prediction_logits=prediction_scores, + sop_logits=sop_scores, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +class AlbertMLMHead(nn.Module): + def __init__(self, config): + super().__init__() + + self.LayerNorm = nn.LayerNorm(config.embedding_size) + self.bias = nn.Parameter(torch.zeros(config.vocab_size)) + self.dense = nn.Linear(config.hidden_size, config.embedding_size) + self.decoder = nn.Linear(config.embedding_size, config.vocab_size) + self.activation = ACT2FN[config.hidden_act] + self.decoder.bias = self.bias + + def forward(self, hidden_states): + hidden_states = self.dense(hidden_states) + hidden_states = self.activation(hidden_states) + hidden_states = self.LayerNorm(hidden_states) + hidden_states = self.decoder(hidden_states) + + prediction_scores = hidden_states + + return prediction_scores + + def _tie_weights(self): + # To tie those two weights if they get disconnected (on TPU or when the bias is resized) + self.bias = self.decoder.bias + + +class AlbertSOPHead(nn.Module): + def __init__(self, config): + super().__init__() + + self.dropout = nn.Dropout(config.classifier_dropout_prob) + self.classifier = nn.Linear(config.hidden_size, config.num_labels) + + def forward(self, pooled_output): + dropout_pooled_output = self.dropout(pooled_output) + logits = self.classifier(dropout_pooled_output) + return logits + + +@add_start_docstrings( + "Albert Model with a `language modeling` head on top.", + ALBERT_START_DOCSTRING, +) +class AlbertForMaskedLM(AlbertPreTrainedModel): + + _keys_to_ignore_on_load_unexpected = [r"pooler"] + + def __init__(self, config): + super().__init__(config) + + self.albert = AlbertModel(config, add_pooling_layer=False) + self.predictions = AlbertMLMHead(config) + + self.init_weights() + + def get_output_embeddings(self): + return self.predictions.decoder + + def set_output_embeddings(self, new_embeddings): + 
self.predictions.decoder = new_embeddings + + def get_input_embeddings(self): + return self.albert.embeddings.word_embeddings + + @add_start_docstrings_to_model_forward(ALBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_code_sample_docstrings( + processor_class=_TOKENIZER_FOR_DOC, + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=MaskedLMOutput, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + labels=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Labels for computing the masked language modeling loss. Indices should be in ``[-100, 0, ..., + config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are ignored + (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]`` + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.albert( + input_ids=input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + sequence_outputs = outputs[0] + + prediction_scores = self.predictions(sequence_outputs) + + masked_lm_loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() + masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1)) + + if not return_dict: + output = (prediction_scores,) + outputs[2:] + return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output + + return MaskedLMOutput( + loss=masked_lm_loss, + logits=prediction_scores, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@add_start_docstrings( + """ + Albert Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled + output) e.g. for GLUE tasks. + """, + ALBERT_START_DOCSTRING, +) +class AlbertForSequenceClassification(AlbertPreTrainedModel): + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + self.config = config + + self.albert = AlbertModel(config) + self.dropout = nn.Dropout(config.classifier_dropout_prob) + self.classifier = nn.Linear(config.hidden_size, self.config.num_labels) + + self.init_weights() + + @add_start_docstrings_to_model_forward(ALBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_code_sample_docstrings( + processor_class=_TOKENIZER_FOR_DOC, + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=SequenceClassifierOutput, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + labels=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`): + Labels for computing the sequence classification/regression loss. Indices should be in ``[0, ..., + config.num_labels - 1]``. If ``config.num_labels == 1`` a regression loss is computed (Mean-Square loss), + If ``config.num_labels > 1`` a classification loss is computed (Cross-Entropy). 
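The label conventions above, together with the `problem_type` dispatch in the forward pass that follows, can be summarised in a small sketch (illustrative only, not part of the patch):

```python
import torch

# num_labels == 1: float targets of shape (batch_size,) -> regression, MSE loss
regression_labels = torch.tensor([0.7, 1.3])

# num_labels > 1 with integer targets of shape (batch_size,) -> single-label cross-entropy
classification_labels = torch.tensor([2, 0])

# float multi-hot targets of shape (batch_size, num_labels) -> multi-label BCE-with-logits
multi_label_targets = torch.tensor([[1.0, 0.0, 1.0],
                                    [0.0, 1.0, 0.0]])
```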
+ """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.albert( + input_ids=input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + pooled_output = outputs[1] + + pooled_output = self.dropout(pooled_output) + logits = self.classifier(pooled_output) + + loss = None + if labels is not None: + if self.config.problem_type is None: + if self.num_labels == 1: + self.config.problem_type = "regression" + elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): + self.config.problem_type = "single_label_classification" + else: + self.config.problem_type = "multi_label_classification" + + if self.config.problem_type == "regression": + loss_fct = MSELoss() + if self.num_labels == 1: + loss = loss_fct(logits.squeeze(), labels.squeeze()) + else: + loss = loss_fct(logits, labels) + elif self.config.problem_type == "single_label_classification": + loss_fct = CrossEntropyLoss() + loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) + elif self.config.problem_type == "multi_label_classification": + loss_fct = BCEWithLogitsLoss() + loss = loss_fct(logits, labels) + + if not return_dict: + output = (logits,) + outputs[2:] + return ((loss,) + output) if loss is not None else output + + return SequenceClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@add_start_docstrings( + """ + Albert Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for + Named-Entity-Recognition (NER) tasks. + """, + ALBERT_START_DOCSTRING, +) +class AlbertForTokenClassification(AlbertPreTrainedModel): + + _keys_to_ignore_on_load_unexpected = [r"pooler"] + + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + + self.albert = AlbertModel(config, add_pooling_layer=False) + classifier_dropout_prob = ( + config.classifier_dropout_prob + if config.classifier_dropout_prob is not None + else config.hidden_dropout_prob + ) + self.dropout = nn.Dropout(classifier_dropout_prob) + self.classifier = nn.Linear(config.hidden_size, self.config.num_labels) + + self.init_weights() + + @add_start_docstrings_to_model_forward(ALBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_code_sample_docstrings( + processor_class=_TOKENIZER_FOR_DOC, + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=TokenClassifierOutput, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + labels=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Labels for computing the token classification loss. Indices should be in ``[0, ..., config.num_labels - + 1]``. 
+ """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.albert( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + + sequence_output = self.dropout(sequence_output) + logits = self.classifier(sequence_output) + + loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() + # Only keep active parts of the loss + if attention_mask is not None: + active_loss = attention_mask.view(-1) == 1 + active_logits = logits.view(-1, self.num_labels) + active_labels = torch.where( + active_loss, labels.view(-1), torch.tensor(loss_fct.ignore_index).type_as(labels) + ) + loss = loss_fct(active_logits, active_labels) + else: + loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) + + if not return_dict: + output = (logits,) + outputs[2:] + return ((loss,) + output) if loss is not None else output + + return TokenClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@add_start_docstrings( + """ + Albert Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear + layers on top of the hidden-states output to compute `span start logits` and `span end logits`). + """, + ALBERT_START_DOCSTRING, +) +class AlbertForQuestionAnswering(AlbertPreTrainedModel): + + _keys_to_ignore_on_load_unexpected = [r"pooler"] + + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + + self.albert = AlbertModel(config, add_pooling_layer=False) + self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels) + + self.init_weights() + + @add_start_docstrings_to_model_forward(ALBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_code_sample_docstrings( + processor_class=_TOKENIZER_FOR_DOC, + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=QuestionAnsweringModelOutput, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + start_positions=None, + end_positions=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`): + Labels for position (index) of the start of the labelled span for computing the token classification loss. + Positions are clamped to the length of the sequence (:obj:`sequence_length`). Position outside of the + sequence are not taken into account for computing the loss. + end_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`): + Labels for position (index) of the end of the labelled span for computing the token classification loss. + Positions are clamped to the length of the sequence (:obj:`sequence_length`). Position outside of the + sequence are not taken into account for computing the loss. 
+ """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.albert( + input_ids=input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + + logits = self.qa_outputs(sequence_output) + start_logits, end_logits = logits.split(1, dim=-1) + start_logits = start_logits.squeeze(-1).contiguous() + end_logits = end_logits.squeeze(-1).contiguous() + + total_loss = None + if start_positions is not None and end_positions is not None: + # If we are on multi-GPU, split add a dimension + if len(start_positions.size()) > 1: + start_positions = start_positions.squeeze(-1) + if len(end_positions.size()) > 1: + end_positions = end_positions.squeeze(-1) + # sometimes the start/end positions are outside our model inputs, we ignore these terms + ignored_index = start_logits.size(1) + start_positions = start_positions.clamp(0, ignored_index) + end_positions = end_positions.clamp(0, ignored_index) + + loss_fct = CrossEntropyLoss(ignore_index=ignored_index) + start_loss = loss_fct(start_logits, start_positions) + end_loss = loss_fct(end_logits, end_positions) + total_loss = (start_loss + end_loss) / 2 + + if not return_dict: + output = (start_logits, end_logits) + outputs[2:] + return ((total_loss,) + output) if total_loss is not None else output + + return QuestionAnsweringModelOutput( + loss=total_loss, + start_logits=start_logits, + end_logits=end_logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@add_start_docstrings( + """ + Albert Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a + softmax) e.g. for RocStories/SWAG tasks. + """, + ALBERT_START_DOCSTRING, +) +class AlbertForMultipleChoice(AlbertPreTrainedModel): + def __init__(self, config): + super().__init__(config) + + self.albert = AlbertModel(config) + self.dropout = nn.Dropout(config.classifier_dropout_prob) + self.classifier = nn.Linear(config.hidden_size, 1) + + self.init_weights() + + @add_start_docstrings_to_model_forward(ALBERT_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length")) + @add_code_sample_docstrings( + processor_class=_TOKENIZER_FOR_DOC, + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=MultipleChoiceModelOutput, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + labels=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`): + Labels for computing the multiple choice classification loss. Indices should be in ``[0, ..., + num_choices-1]`` where `num_choices` is the size of the second dimension of the input tensors. 
(see + `input_ids` above) + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1] + + input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None + attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None + token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None + position_ids = position_ids.view(-1, position_ids.size(-1)) if position_ids is not None else None + inputs_embeds = ( + inputs_embeds.view(-1, inputs_embeds.size(-2), inputs_embeds.size(-1)) + if inputs_embeds is not None + else None + ) + outputs = self.albert( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + pooled_output = outputs[1] + + pooled_output = self.dropout(pooled_output) + logits = self.classifier(pooled_output) + reshaped_logits = logits.view(-1, num_choices) + + loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() + loss = loss_fct(reshaped_logits, labels) + + if not return_dict: + output = (reshaped_logits,) + outputs[2:] + return ((loss,) + output) if loss is not None else output + + return MultipleChoiceModelOutput( + loss=loss, + logits=reshaped_logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) diff --git a/fengshen/models/auto/__init__.py b/fengshen/models/auto/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..ef185f32cc2d9f9b30db1a6a681ce2df34936351 --- /dev/null +++ b/fengshen/models/auto/__init__.py @@ -0,0 +1,56 @@ +# coding=utf-8 +# Copyright 2021 The IDEA Authors. All rights reserved. + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at + +# http://www.apache.org/licenses/LICENSE-2.0 + +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
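Before moving on to the `auto` package, the input reshaping used by `AlbertForMultipleChoice` above is worth sketching (illustrative only, not part of the patch): every choice is scored as its own sequence and the per-sequence logits are folded back into `(batch_size, num_choices)`.

```python
import torch

batch_size, num_choices, seq_len = 2, 4, 16              # hypothetical sizes

input_ids = torch.zeros(batch_size, num_choices, seq_len, dtype=torch.long)
flat_input_ids = input_ids.view(-1, input_ids.size(-1))  # (batch_size * num_choices, seq_len)

# One score per flattened sequence, regrouped per example before the cross-entropy:
logits = torch.randn(batch_size * num_choices, 1)
reshaped_logits = logits.view(-1, num_choices)            # (batch_size, num_choices)
assert reshaped_logits.shape == (batch_size, num_choices)
```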
+ +from typing import TYPE_CHECKING + +from transformers.file_utils import _LazyModule, is_torch_available + + +_import_structure = { + "auto_factory": ["get_values"], + "configuration_auto": ["ALL_PRETRAINED_CONFIG_ARCHIVE_MAP", "CONFIG_MAPPING", "MODEL_NAMES_MAPPING", "AutoConfig"], + "tokenization_auto": ["TOKENIZER_MAPPING", "AutoTokenizer"], +} + +if is_torch_available(): + _import_structure["modeling_auto"] = [ + "AutoModel", + "AutoModelForMaskedLM", + "AutoModelForMultipleChoice", + "AutoModelForPreTraining", + "AutoModelForQuestionAnswering", + "AutoModelForSequenceClassification", + "AutoModelForTokenClassification", + ] + +if TYPE_CHECKING: + from .auto_factory import get_values + from .configuration_auto import ALL_PRETRAINED_CONFIG_ARCHIVE_MAP, CONFIG_MAPPING, MODEL_NAMES_MAPPING, AutoConfig + from .tokenization_auto import TOKENIZER_MAPPING, AutoTokenizer + if is_torch_available(): + from .modeling_auto import ( + AutoModel, + AutoModelForMaskedLM, + AutoModelForMultipleChoice, + AutoModelForPreTraining, + AutoModelForQuestionAnswering, + AutoModelForSequenceClassification, + AutoModelForTokenClassification, + ) + +else: + import sys + + sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure) diff --git a/fengshen/models/auto/auto_factory.py b/fengshen/models/auto/auto_factory.py new file mode 100644 index 0000000000000000000000000000000000000000..688bbd4853284305d047be0552077f721e2f97de --- /dev/null +++ b/fengshen/models/auto/auto_factory.py @@ -0,0 +1,644 @@ +# coding=utf-8 +# Copyright 2021 The IDEA Authors. All rights reserved. + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at + +# http://www.apache.org/licenses/LICENSE-2.0 + +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Factory function to build auto-model classes.""" +import importlib +from collections import OrderedDict + +from transformers.configuration_utils import PretrainedConfig +from transformers.file_utils import copy_func +from transformers.utils import logging +from .configuration_auto import AutoConfig, model_type_to_module_name, replace_list_option_in_docstrings +from .dynamic import get_class_from_dynamic_module + + +logger = logging.get_logger(__name__) + + +CLASS_DOCSTRING = """ + This is a generic model class that will be instantiated as one of the model classes of the library when created + with the [`~BaseAutoModelClass.from_pretrained`] class method or the [`~BaseAutoModelClass.from_config`] class + method. + + This class cannot be instantiated directly using `__init__()` (throws an error). +""" + +FROM_CONFIG_DOCSTRING = """ + Instantiates one of the model classes of the library from a configuration. + + Note: + Loading a model from its configuration file does **not** load the model weights. It only affects the + model's configuration. Use [`~BaseAutoModelClass.from_pretrained`] to load the model weights. 
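The `__init__.py` above wires the package through `_LazyModule`, so heavyweight submodules such as `modeling_auto` are only imported when one of their attributes is first accessed. A minimal usage sketch (illustrative only, not part of the patch; it assumes the fengshen package and its dependencies are importable):

```python
# Importing the package itself is cheap: no model code has been loaded yet.
from fengshen.models import auto

# Attribute access triggers the lazy import of the corresponding submodule.
AutoConfig = auto.AutoConfig   # loads configuration_auto on first access
AutoModel = auto.AutoModel     # loads modeling_auto (requires torch)
```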
+ + Args: + config ([`PretrainedConfig`]): + The model class to instantiate is selected based on the configuration class: + + List options + + Examples: + + ```python + >>> from transformers import AutoConfig, BaseAutoModelClass + + >>> # Download configuration from huggingface.co and cache. + >>> config = AutoConfig.from_pretrained("checkpoint_placeholder") + >>> model = BaseAutoModelClass.from_config(config) + ``` +""" + +FROM_PRETRAINED_TORCH_DOCSTRING = """ + Instantiate one of the model classes of the library from a pretrained model. + + The model class to instantiate is selected based on the `model_type` property of the config object (either + passed as an argument or loaded from `pretrained_model_name_or_path` if possible), or when it's missing, by + falling back to using pattern matching on `pretrained_model_name_or_path`: + + List options + + The model is set in evaluation mode by default using `model.eval()` (so for instance, dropout modules are + deactivated). To train the model, you should first set it back in training mode with `model.train()` + + Args: + pretrained_model_name_or_path (`str` or `os.PathLike`): + Can be either: + + - A string, the *model id* of a pretrained model hosted inside a model repo on huggingface.co. + Valid model ids can be located at the root-level, like `bert-base-uncased`, or namespaced under a + user or organization name, like `dbmdz/bert-base-german-cased`. + - A path to a *directory* containing model weights saved using + [`~PreTrainedModel.save_pretrained`], e.g., `./my_model_directory/`. + - A path or url to a *tensorflow index checkpoint file* (e.g, `./tf_model/model.ckpt.index`). In + this case, `from_tf` should be set to `True` and a configuration object should be provided as + `config` argument. This loading path is slower than converting the TensorFlow checkpoint in a + PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards. + model_args (additional positional arguments, *optional*): + Will be passed along to the underlying model `__init__()` method. + config ([`PretrainedConfig`], *optional*): + Configuration for the model to use instead of an automatically loaded configuration. Configuration can + be automatically loaded when: + + - The model is a model provided by the library (loaded with the *model id* string of a pretrained + model). + - The model was saved using [`~PreTrainedModel.save_pretrained`] and is reloaded by supplying the + save directory. + - The model is loaded by supplying a local directory as `pretrained_model_name_or_path` and a + configuration JSON file named *config.json* is found in the directory. + state_dict (*Dict[str, torch.Tensor]*, *optional*): + A state dictionary to use instead of a state dictionary loaded from saved weights file. + + This option can be used if you want to create a model from a pretrained configuration but load your own + weights. In this case though, you should check if using [`~PreTrainedModel.save_pretrained`] and + [`~PreTrainedModel.from_pretrained`] is not a simpler option. + cache_dir (`str` or `os.PathLike`, *optional*): + Path to a directory in which a downloaded pretrained model configuration should be cached if the + standard cache should not be used. + from_tf (`bool`, *optional*, defaults to `False`): + Load the model weights from a TensorFlow checkpoint save file (see docstring of + `pretrained_model_name_or_path` argument). 
+ force_download (`bool`, *optional*, defaults to `False`): + Whether or not to force the (re-)download of the model weights and configuration files, overriding the + cached versions if they exist. + resume_download (`bool`, *optional*, defaults to `False`): + Whether or not to delete incompletely received files. Will attempt to resume the download if such a + file exists. + proxies (`Dict[str, str]`, *optional*): + A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128', + 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request. + output_loading_info(`bool`, *optional*, defaults to `False`): + Whether ot not to also return a dictionary containing missing keys, unexpected keys and error messages. + local_files_only(`bool`, *optional*, defaults to `False`): + Whether or not to only look at local files (e.g., not try downloading the model). + revision(`str`, *optional*, defaults to `"main"`): + The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a + git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any + identifier allowed by git. + trust_remote_code (`bool`, *optional*, defaults to `False`): + Whether or not to allow for custom models defined on the Hub in their own modeling files. This option + should only be set to `True` for repositories you trust and in which you have read the code, as it will + execute code present on the Hub on your local machine. + kwargs (additional keyword arguments, *optional*): + Can be used to update the configuration object (after it being loaded) and initiate the model (e.g., + `output_attentions=True`). Behaves differently depending on whether a `config` is provided or + automatically loaded: + + - If a configuration is provided with `config`, `**kwargs` will be directly passed to the + underlying model's `__init__` method (we assume all relevant updates to the configuration have + already been done) + - If a configuration is not provided, `kwargs` will be first passed to the configuration class + initialization function ([`~PretrainedConfig.from_pretrained`]). Each key of `kwargs` that + corresponds to a configuration attribute will be used to override said attribute with the + supplied `kwargs` value. Remaining keys that do not correspond to any configuration attribute + will be passed to the underlying model's `__init__` function. + + Examples: + + ```python + >>> from transformers import AutoConfig, BaseAutoModelClass + + >>> # Download model and configuration from huggingface.co and cache. + >>> model = BaseAutoModelClass.from_pretrained("checkpoint_placeholder") + + >>> # Update configuration during loading + >>> model = BaseAutoModelClass.from_pretrained("checkpoint_placeholder", output_attentions=True) + >>> model.config.output_attentions + True + + >>> # Loading from a TF checkpoint file instead of a PyTorch model (slower) + >>> config = AutoConfig.from_pretrained("./tf_model/shortcut_placeholder_tf_model_config.json") + >>> model = BaseAutoModelClass.from_pretrained( + ... "./tf_model/shortcut_placeholder_tf_checkpoint.ckpt.index", from_tf=True, config=config + ... ) + ``` +""" + +FROM_PRETRAINED_TF_DOCSTRING = """ + Instantiate one of the model classes of the library from a pretrained model. 
+ + The model class to instantiate is selected based on the `model_type` property of the config object (either + passed as an argument or loaded from `pretrained_model_name_or_path` if possible), or when it's missing, by + falling back to using pattern matching on `pretrained_model_name_or_path`: + + List options + + Args: + pretrained_model_name_or_path (`str` or `os.PathLike`): + Can be either: + + - A string, the *model id* of a pretrained model hosted inside a model repo on huggingface.co. + Valid model ids can be located at the root-level, like `bert-base-uncased`, or namespaced under a + user or organization name, like `dbmdz/bert-base-german-cased`. + - A path to a *directory* containing model weights saved using + [`~PreTrainedModel.save_pretrained`], e.g., `./my_model_directory/`. + - A path or url to a *PyTorch state_dict save file* (e.g, `./pt_model/pytorch_model.bin`). In this + case, `from_pt` should be set to `True` and a configuration object should be provided as `config` + argument. This loading path is slower than converting the PyTorch model in a TensorFlow model + using the provided conversion scripts and loading the TensorFlow model afterwards. + model_args (additional positional arguments, *optional*): + Will be passed along to the underlying model `__init__()` method. + config ([`PretrainedConfig`], *optional*): + Configuration for the model to use instead of an automatically loaded configuration. Configuration can + be automatically loaded when: + + - The model is a model provided by the library (loaded with the *model id* string of a pretrained + model). + - The model was saved using [`~PreTrainedModel.save_pretrained`] and is reloaded by supplying the + save directory. + - The model is loaded by supplying a local directory as `pretrained_model_name_or_path` and a + configuration JSON file named *config.json* is found in the directory. + cache_dir (`str` or `os.PathLike`, *optional*): + Path to a directory in which a downloaded pretrained model configuration should be cached if the + standard cache should not be used. + from_pt (`bool`, *optional*, defaults to `False`): + Load the model weights from a PyTorch checkpoint save file (see docstring of + `pretrained_model_name_or_path` argument). + force_download (`bool`, *optional*, defaults to `False`): + Whether or not to force the (re-)download of the model weights and configuration files, overriding the + cached versions if they exist. + resume_download (`bool`, *optional*, defaults to `False`): + Whether or not to delete incompletely received files. Will attempt to resume the download if such a + file exists. + proxies (`Dict[str, str]`, *optional*): + A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128', + 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request. + output_loading_info(`bool`, *optional*, defaults to `False`): + Whether ot not to also return a dictionary containing missing keys, unexpected keys and error messages. + local_files_only(`bool`, *optional*, defaults to `False`): + Whether or not to only look at local files (e.g., not try downloading the model). + revision(`str`, *optional*, defaults to `"main"`): + The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a + git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any + identifier allowed by git. 
+ trust_remote_code (`bool`, *optional*, defaults to `False`): + Whether or not to allow for custom models defined on the Hub in their own modeling files. This option + should only be set to `True` for repositories you trust and in which you have read the code, as it will + execute code present on the Hub on your local machine. + kwargs (additional keyword arguments, *optional*): + Can be used to update the configuration object (after it being loaded) and initiate the model (e.g., + `output_attentions=True`). Behaves differently depending on whether a `config` is provided or + automatically loaded: + + - If a configuration is provided with `config`, `**kwargs` will be directly passed to the + underlying model's `__init__` method (we assume all relevant updates to the configuration have + already been done) + - If a configuration is not provided, `kwargs` will be first passed to the configuration class + initialization function ([`~PretrainedConfig.from_pretrained`]). Each key of `kwargs` that + corresponds to a configuration attribute will be used to override said attribute with the + supplied `kwargs` value. Remaining keys that do not correspond to any configuration attribute + will be passed to the underlying model's `__init__` function. + + Examples: + + ```python + >>> from transformers import AutoConfig, BaseAutoModelClass + + >>> # Download model and configuration from huggingface.co and cache. + >>> model = BaseAutoModelClass.from_pretrained("checkpoint_placeholder") + + >>> # Update configuration during loading + >>> model = BaseAutoModelClass.from_pretrained("checkpoint_placeholder", output_attentions=True) + >>> model.config.output_attentions + True + + >>> # Loading from a PyTorch checkpoint file instead of a TensorFlow model (slower) + >>> config = AutoConfig.from_pretrained("./pt_model/shortcut_placeholder_pt_model_config.json") + >>> model = BaseAutoModelClass.from_pretrained( + ... "./pt_model/shortcut_placeholder_pytorch_model.bin", from_pt=True, config=config + ... ) + ``` +""" + +FROM_PRETRAINED_FLAX_DOCSTRING = """ + Instantiate one of the model classes of the library from a pretrained model. + + The model class to instantiate is selected based on the `model_type` property of the config object (either + passed as an argument or loaded from `pretrained_model_name_or_path` if possible), or when it's missing, by + falling back to using pattern matching on `pretrained_model_name_or_path`: + + List options + + Args: + pretrained_model_name_or_path (`str` or `os.PathLike`): + Can be either: + + - A string, the *model id* of a pretrained model hosted inside a model repo on huggingface.co. + Valid model ids can be located at the root-level, like `bert-base-uncased`, or namespaced under a + user or organization name, like `dbmdz/bert-base-german-cased`. + - A path to a *directory* containing model weights saved using + [`~PreTrainedModel.save_pretrained`], e.g., `./my_model_directory/`. + - A path or url to a *PyTorch state_dict save file* (e.g, `./pt_model/pytorch_model.bin`). In this + case, `from_pt` should be set to `True` and a configuration object should be provided as `config` + argument. This loading path is slower than converting the PyTorch model in a TensorFlow model + using the provided conversion scripts and loading the TensorFlow model afterwards. + model_args (additional positional arguments, *optional*): + Will be passed along to the underlying model `__init__()` method. 
+ config ([`PretrainedConfig`], *optional*): + Configuration for the model to use instead of an automatically loaded configuration. Configuration can + be automatically loaded when: + + - The model is a model provided by the library (loaded with the *model id* string of a pretrained + model). + - The model was saved using [`~PreTrainedModel.save_pretrained`] and is reloaded by supplying the + save directory. + - The model is loaded by supplying a local directory as `pretrained_model_name_or_path` and a + configuration JSON file named *config.json* is found in the directory. + cache_dir (`str` or `os.PathLike`, *optional*): + Path to a directory in which a downloaded pretrained model configuration should be cached if the + standard cache should not be used. + from_pt (`bool`, *optional*, defaults to `False`): + Load the model weights from a PyTorch checkpoint save file (see docstring of + `pretrained_model_name_or_path` argument). + force_download (`bool`, *optional*, defaults to `False`): + Whether or not to force the (re-)download of the model weights and configuration files, overriding the + cached versions if they exist. + resume_download (`bool`, *optional*, defaults to `False`): + Whether or not to delete incompletely received files. Will attempt to resume the download if such a + file exists. + proxies (`Dict[str, str]`, *optional*): + A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128', + 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request. + output_loading_info(`bool`, *optional*, defaults to `False`): + Whether ot not to also return a dictionary containing missing keys, unexpected keys and error messages. + local_files_only(`bool`, *optional*, defaults to `False`): + Whether or not to only look at local files (e.g., not try downloading the model). + revision(`str`, *optional*, defaults to `"main"`): + The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a + git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any + identifier allowed by git. + trust_remote_code (`bool`, *optional*, defaults to `False`): + Whether or not to allow for custom models defined on the Hub in their own modeling files. This option + should only be set to `True` for repositories you trust and in which you have read the code, as it will + execute code present on the Hub on your local machine. + kwargs (additional keyword arguments, *optional*): + Can be used to update the configuration object (after it being loaded) and initiate the model (e.g., + `output_attentions=True`). Behaves differently depending on whether a `config` is provided or + automatically loaded: + + - If a configuration is provided with `config`, `**kwargs` will be directly passed to the + underlying model's `__init__` method (we assume all relevant updates to the configuration have + already been done) + - If a configuration is not provided, `kwargs` will be first passed to the configuration class + initialization function ([`~PretrainedConfig.from_pretrained`]). Each key of `kwargs` that + corresponds to a configuration attribute will be used to override said attribute with the + supplied `kwargs` value. Remaining keys that do not correspond to any configuration attribute + will be passed to the underlying model's `__init__` function. 
+ + Examples: + + ```python + >>> from transformers import AutoConfig, BaseAutoModelClass + + >>> # Download model and configuration from huggingface.co and cache. + >>> model = BaseAutoModelClass.from_pretrained("checkpoint_placeholder") + + >>> # Update configuration during loading + >>> model = BaseAutoModelClass.from_pretrained("checkpoint_placeholder", output_attentions=True) + >>> model.config.output_attentions + True + + >>> # Loading from a PyTorch checkpoint file instead of a TensorFlow model (slower) + >>> config = AutoConfig.from_pretrained("./pt_model/shortcut_placeholder_pt_model_config.json") + >>> model = BaseAutoModelClass.from_pretrained( + ... "./pt_model/shortcut_placeholder_pytorch_model.bin", from_pt=True, config=config + ... ) + ``` +""" + + +def _get_model_class(config, model_mapping): + supported_models = model_mapping[type(config)] + if not isinstance(supported_models, (list, tuple)): + return supported_models + + name_to_model = {model.__name__: model for model in supported_models} + architectures = getattr(config, "architectures", []) + for arch in architectures: + if arch in name_to_model: + return name_to_model[arch] + elif f"TF{arch}" in name_to_model: + return name_to_model[f"TF{arch}"] + elif f"Flax{arch}" in name_to_model: + return name_to_model[f"Flax{arch}"] + + # If not architecture is set in the config or match the supported models, the first element of the tuple is the + # defaults. + return supported_models[0] + + +class _BaseAutoModelClass: + # Base class for auto models. + _model_mapping = None + + def __init__(self, *args, **kwargs): + raise EnvironmentError( + f"{self.__class__.__name__} is designed to be instantiated " + f"using the `{self.__class__.__name__}.from_pretrained(pretrained_model_name_or_path)` or " + f"`{self.__class__.__name__}.from_config(config)` methods." + ) + + @classmethod + def from_config(cls, config, **kwargs): + trust_remote_code = kwargs.pop("trust_remote_code", False) + if hasattr(config, "auto_map") and cls.__name__ in config.auto_map: + if not trust_remote_code: + raise ValueError( + "Loading this model requires you to execute the modeling file in that repo " + "on your local machine. Make sure you have read the code there to avoid malicious use, then set " + "the option `trust_remote_code=True` to remove this error." + ) + if kwargs.get("revision", None) is None: + logger.warn( + "Explicitly passing a `revision` is encouraged when loading a model with custom code to ensure " + "no malicious code has been contributed in a newer revision." + ) + class_ref = config.auto_map[cls.__name__] + module_file, class_name = class_ref.split(".") + model_class = get_class_from_dynamic_module( + config.name_or_path, module_file + ".py", class_name, **kwargs) + return model_class._from_config(config, **kwargs) + elif type(config) in cls._model_mapping.keys(): + model_class = _get_model_class(config, cls._model_mapping) + return model_class._from_config(config, **kwargs) + + raise ValueError( + f"Unrecognized configuration class {config.__class__} for this kind of AutoModel: {cls.__name__}.\n" + f"Model type should be one of {', '.join(c.__name__ for c in cls._model_mapping.keys())}." 
+ ) + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): + config = kwargs.pop("config", None) + trust_remote_code = kwargs.pop("trust_remote_code", False) + kwargs["_from_auto"] = True + if not isinstance(config, PretrainedConfig): + config, kwargs = AutoConfig.from_pretrained( + pretrained_model_name_or_path, return_unused_kwargs=True, trust_remote_code=trust_remote_code, **kwargs + ) + if hasattr(config, "auto_map") and cls.__name__ in config.auto_map: + if not trust_remote_code: + raise ValueError( + f"Loading {pretrained_model_name_or_path} requires you to execute the modeling file in that repo " + "on your local machine. Make sure you have read the code there to avoid malicious use, then set " + "the option `trust_remote_code=True` to remove this error." + ) + if kwargs.get("revision", None) is None: + logger.warn( + "Explicitly passing a `revision` is encouraged when loading a model with custom code to ensure " + "no malicious code has been contributed in a newer revision." + ) + class_ref = config.auto_map[cls.__name__] + module_file, class_name = class_ref.split(".") + model_class = get_class_from_dynamic_module( + pretrained_model_name_or_path, module_file + ".py", class_name, **kwargs + ) + return model_class.from_pretrained(pretrained_model_name_or_path, *model_args, config=config, **kwargs) + elif type(config) in cls._model_mapping.keys(): + model_class = _get_model_class(config, cls._model_mapping) + return model_class.from_pretrained(pretrained_model_name_or_path, *model_args, config=config, **kwargs) + raise ValueError( + f"Unrecognized configuration class {config.__class__} for this kind of AutoModel: {cls.__name__}.\n" + f"Model type should be one of {', '.join(c.__name__ for c in cls._model_mapping.keys())}." + ) + + @classmethod + def register(cls, config_class, model_class): + """ + Register a new model for this class. + + Args: + config_class ([`PretrainedConfig`]): + The configuration corresponding to the model to register. + model_class ([`PreTrainedModel`]): + The model to register. + """ + if hasattr(model_class, "config_class") and model_class.config_class != config_class: + raise ValueError( + "The model class you are passing has a `config_class` attribute that is not consistent with the " + f"config class you passed (model has {model_class.config_class} and you passed {config_class}. Fix " + "one of those so they match!" + ) + cls._model_mapping.register(config_class, model_class) + + +def insert_head_doc(docstring, head_doc=""): + if len(head_doc) > 0: + return docstring.replace( + "one of the model classes of the library ", + f"one of the model classes of the library (with a {head_doc} head) ", + ) + return docstring.replace( + "one of the model classes of the library ", "one of the base model classes of the library " + ) + + +def auto_class_update(cls, checkpoint_for_example="bert-base-cased", head_doc=""): + # Create a new class with the right name from the base class + model_mapping = cls._model_mapping + name = cls.__name__ + class_docstring = insert_head_doc(CLASS_DOCSTRING, head_doc=head_doc) + cls.__doc__ = class_docstring.replace("BaseAutoModelClass", name) + + # Now we need to copy and re-register `from_config` and `from_pretrained` as class methods otherwise we can't + # have a specific docstrings for them. 
+ from_config = copy_func(_BaseAutoModelClass.from_config) + from_config_docstring = insert_head_doc( + FROM_CONFIG_DOCSTRING, head_doc=head_doc) + from_config_docstring = from_config_docstring.replace( + "BaseAutoModelClass", name) + from_config_docstring = from_config_docstring.replace( + "checkpoint_placeholder", checkpoint_for_example) + from_config.__doc__ = from_config_docstring + from_config = replace_list_option_in_docstrings( + model_mapping._model_mapping, use_model_types=False)(from_config) + cls.from_config = classmethod(from_config) + + if name.startswith("TF"): + from_pretrained_docstring = FROM_PRETRAINED_TF_DOCSTRING + elif name.startswith("Flax"): + from_pretrained_docstring = FROM_PRETRAINED_FLAX_DOCSTRING + else: + from_pretrained_docstring = FROM_PRETRAINED_TORCH_DOCSTRING + from_pretrained = copy_func(_BaseAutoModelClass.from_pretrained) + from_pretrained_docstring = insert_head_doc( + from_pretrained_docstring, head_doc=head_doc) + from_pretrained_docstring = from_pretrained_docstring.replace( + "BaseAutoModelClass", name) + from_pretrained_docstring = from_pretrained_docstring.replace( + "checkpoint_placeholder", checkpoint_for_example) + shortcut = checkpoint_for_example.split("/")[-1].split("-")[0] + from_pretrained_docstring = from_pretrained_docstring.replace( + "shortcut_placeholder", shortcut) + from_pretrained.__doc__ = from_pretrained_docstring + from_pretrained = replace_list_option_in_docstrings( + model_mapping._model_mapping)(from_pretrained) + cls.from_pretrained = classmethod(from_pretrained) + return cls + + +def get_values(model_mapping): + result = [] + for model in model_mapping.values(): + if isinstance(model, (list, tuple)): + result += list(model) + else: + result.append(model) + + return result + + +def getattribute_from_module(module, attr): + if attr is None: + return None + if isinstance(attr, tuple): + return tuple(getattribute_from_module(module, a) for a in attr) + if hasattr(module, attr): + return getattr(module, attr) + # Some of the mappings have entries model_type -> object of another model type. In that case we try to grab the + # object at the top level. + transformers_module = importlib.import_module("fengshen") + return getattribute_from_module(transformers_module, attr) + + +class _LazyAutoMapping(OrderedDict): + """ + " A mapping config to object (model or tokenizer for instance) that will load keys and values when it is accessed. 
+ + Args: + + - config_mapping: The map model type to config class + - model_mapping: The map model type to model (or tokenizer) class + """ + + def __init__(self, config_mapping, model_mapping): + self._config_mapping = config_mapping + self._reverse_config_mapping = { + v: k for k, v in config_mapping.items()} + self._model_mapping = model_mapping + self._extra_content = {} + self._modules = {} + + def __getitem__(self, key): + if key in self._extra_content: + return self._extra_content[key] + model_type = self._reverse_config_mapping[key.__name__] + if model_type not in self._model_mapping: + raise KeyError(key) + model_name = self._model_mapping[model_type] + return self._load_attr_from_module(model_type, model_name) + + def _load_attr_from_module(self, model_type, attr): + module_name = model_type_to_module_name(model_type) + if module_name not in self._modules: + self._modules[module_name] = importlib.import_module( + f".{module_name}", "fengshen.models") + return getattribute_from_module(self._modules[module_name], attr) + + def keys(self): + mapping_keys = [ + self._load_attr_from_module(key, name) + for key, name in self._config_mapping.items() + if key in self._model_mapping.keys() + ] + return mapping_keys + list(self._extra_content.keys()) + + def get(self, key, default): + try: + return self.__getitem__(key) + except KeyError: + return default + + def __bool__(self): + return bool(self.keys()) + + def values(self): + mapping_values = [ + self._load_attr_from_module(key, name) + for key, name in self._model_mapping.items() + if key in self._config_mapping.keys() + ] + return mapping_values + list(self._extra_content.values()) + + def items(self): + mapping_items = [ + ( + self._load_attr_from_module(key, self._config_mapping[key]), + self._load_attr_from_module(key, self._model_mapping[key]), + ) + for key in self._model_mapping.keys() + if key in self._config_mapping.keys() + ] + return mapping_items + list(self._extra_content.items()) + + def __iter__(self): + return iter(self.keys()) + + def __contains__(self, item): + if item in self._extra_content: + return True + if not hasattr(item, "__name__") or item.__name__ not in self._reverse_config_mapping: + return False + model_type = self._reverse_config_mapping[item.__name__] + return model_type in self._model_mapping + + def register(self, key, value): + """ + Register a new model in this mapping. + """ + if hasattr(key, "__name__") and key.__name__ in self._reverse_config_mapping: + model_type = self._reverse_config_mapping[key.__name__] + if model_type in self._model_mapping.keys(): + raise ValueError( + f"'{key}' is already used by a Transformers model.") + + self._extra_content[key] = value diff --git a/fengshen/models/auto/configuration_auto.py b/fengshen/models/auto/configuration_auto.py new file mode 100644 index 0000000000000000000000000000000000000000..81676226e57ca519273b98328a1afe6961c37ce3 --- /dev/null +++ b/fengshen/models/auto/configuration_auto.py @@ -0,0 +1,403 @@ +# coding=utf-8 +# Copyright 2021 The IDEA Authors. All rights reserved. + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at + +# http://www.apache.org/licenses/LICENSE-2.0 + +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +""" Auto Config class.""" +import importlib +import re +import warnings +from collections import OrderedDict +from typing import List, Union + +from transformers.configuration_utils import PretrainedConfig +from transformers.file_utils import CONFIG_NAME +from transformers.utils import logging +from .dynamic import get_class_from_dynamic_module + + +logger = logging.get_logger(__name__) + +CONFIG_MAPPING_NAMES = OrderedDict( + [ + # Add configs here + ("roformer", "RoFormerConfig"), + ("longformer", "LongformerConfig"), + ] +) + +CONFIG_ARCHIVE_MAP_MAPPING_NAMES = OrderedDict( + [ + # Add archive maps here + ("roformer", "ROFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP"), + ("longformer", "LONGFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP"), + ] +) + +MODEL_NAMES_MAPPING = OrderedDict( + [ + # Add full (and cased) model names here + ("roformer", "Roformer"), + ("longformer", "Longformer"), + ] +) + +SPECIAL_MODEL_TYPE_TO_MODULE_NAME = OrderedDict([("openai-gpt", "openai")]) + + +def model_type_to_module_name(key): + """Converts a config key to the corresponding module.""" + # Special treatment + if key in SPECIAL_MODEL_TYPE_TO_MODULE_NAME: + return SPECIAL_MODEL_TYPE_TO_MODULE_NAME[key] + + return key.replace("-", "_") + + +def config_class_to_model_type(config): + """Converts a config class name to the corresponding model type""" + for key, cls in CONFIG_MAPPING_NAMES.items(): + if cls == config: + return key + return None + + +class _LazyConfigMapping(OrderedDict): + """ + A dictionary that lazily load its values when they are requested. + """ + + def __init__(self, mapping): + self._mapping = mapping + self._extra_content = {} + self._modules = {} + + def __getitem__(self, key): + if key in self._extra_content: + return self._extra_content[key] + if key not in self._mapping: + raise KeyError(key) + value = self._mapping[key] + module_name = model_type_to_module_name(key) + if module_name not in self._modules: + self._modules[module_name] = importlib.import_module(f".{module_name}", "fengshen.models") + + return getattr(self._modules[module_name], value) + + def keys(self): + return list(self._mapping.keys()) + list(self._extra_content.keys()) + + def values(self): + return [self[k] for k in self._mapping.keys()] + list(self._extra_content.values()) + + def items(self): + return [(k, self[k]) for k in self._mapping.keys()] + list(self._extra_content.items()) + + def __iter__(self): + return iter(list(self._mapping.keys()) + list(self._extra_content.keys())) + + def __contains__(self, item): + return item in self._mapping or item in self._extra_content + + def register(self, key, value): + """ + Register a new configuration in this mapping. + """ + if key in self._mapping.keys(): + raise ValueError(f"'{key}' is already used by a Transformers config, pick another name.") + self._extra_content[key] = value + + +CONFIG_MAPPING = _LazyConfigMapping(CONFIG_MAPPING_NAMES) + + +class _LazyLoadAllMappings(OrderedDict): + """ + A mapping that will load all pairs of key values at the first access (either by indexing, requestions keys, values, + etc.) + + Args: + mapping: The mapping to load. + """ + + def __init__(self, mapping): + self._mapping = mapping + self._initialized = False + self._data = {} + + def _initialize(self): + if self._initialized: + return + warnings.warn( + "ALL_PRETRAINED_CONFIG_ARCHIVE_MAP is deprecated and will be removed in v5 of Transformers. 
" + "It does not contain all available model checkpoints, far from it. Checkout hf.co/models for that.", + FutureWarning, + ) + + for model_type, map_name in self._mapping.items(): + module_name = model_type_to_module_name(model_type) + module = importlib.import_module(f".{module_name}", "transformers.models") + mapping = getattr(module, map_name) + self._data.update(mapping) + + self._initialized = True + + def __getitem__(self, key): + self._initialize() + return self._data[key] + + def keys(self): + self._initialize() + return self._data.keys() + + def values(self): + self._initialize() + return self._data.values() + + def items(self): + self._initialize() + return self._data.keys() + + def __iter__(self): + self._initialize() + return iter(self._data) + + def __contains__(self, item): + self._initialize() + return item in self._data + + +ALL_PRETRAINED_CONFIG_ARCHIVE_MAP = _LazyLoadAllMappings(CONFIG_ARCHIVE_MAP_MAPPING_NAMES) + + +def _get_class_name(model_class: Union[str, List[str]]): + if isinstance(model_class, (list, tuple)): + return " or ".join([f"[`{c}`]" for c in model_class if c is not None]) + return f"[`{model_class}`]" + + +def _list_model_options(indent, config_to_class=None, use_model_types=True): + if config_to_class is None and not use_model_types: + raise ValueError("Using `use_model_types=False` requires a `config_to_class` dictionary.") + if use_model_types: + if config_to_class is None: + model_type_to_name = {model_type: f"[`{config}`]" for model_type, config in CONFIG_MAPPING_NAMES.items()} + else: + model_type_to_name = { + model_type: _get_class_name(model_class) + for model_type, model_class in config_to_class.items() + if model_type in MODEL_NAMES_MAPPING + } + lines = [ + f"{indent}- **{model_type}** -- {model_type_to_name[model_type]} ({MODEL_NAMES_MAPPING[model_type]} model)" + for model_type in sorted(model_type_to_name.keys()) + ] + else: + config_to_name = { + CONFIG_MAPPING_NAMES[config]: _get_class_name(clas) + for config, clas in config_to_class.items() + if config in CONFIG_MAPPING_NAMES + } + config_to_model_name = { + config: MODEL_NAMES_MAPPING[model_type] for model_type, config in CONFIG_MAPPING_NAMES.items() + } + lines = [ + f"{indent}- [`{config_name}`] configuration class: {config_to_name[config_name]} ({config_to_model_name[config_name]} model)" + for config_name in sorted(config_to_name.keys()) + ] + return "\n".join(lines) + + +def replace_list_option_in_docstrings(config_to_class=None, use_model_types=True): + def docstring_decorator(fn): + docstrings = fn.__doc__ + lines = docstrings.split("\n") + i = 0 + while i < len(lines) and re.search(r"^(\s*)List options\s*$", lines[i]) is None: + i += 1 + if i < len(lines): + indent = re.search(r"^(\s*)List options\s*$", lines[i]).groups()[0] + if use_model_types: + indent = f"{indent} " + lines[i] = _list_model_options(indent, config_to_class=config_to_class, use_model_types=use_model_types) + docstrings = "\n".join(lines) + else: + raise ValueError( + f"The function {fn} should have an empty 'List options' in its docstring as placeholder, current docstring is:\n{docstrings}" + ) + fn.__doc__ = docstrings + return fn + + return docstring_decorator + + +class AutoConfig: + r""" + This is a generic configuration class that will be instantiated as one of the configuration classes of the library + when created with the [`~AutoConfig.from_pretrained`] class method. + + This class cannot be instantiated directly using `__init__()` (throws an error). 
+ """ + + def __init__(self): + raise EnvironmentError( + "AutoConfig is designed to be instantiated " + "using the `AutoConfig.from_pretrained(pretrained_model_name_or_path)` method." + ) + + @classmethod + def for_model(cls, model_type: str, *args, **kwargs): + if model_type in CONFIG_MAPPING: + config_class = CONFIG_MAPPING[model_type] + return config_class(*args, **kwargs) + raise ValueError( + f"Unrecognized model identifier: {model_type}. Should contain one of {', '.join(CONFIG_MAPPING.keys())}" + ) + + @classmethod + @replace_list_option_in_docstrings() + def from_pretrained(cls, pretrained_model_name_or_path, **kwargs): + r""" + Instantiate one of the configuration classes of the library from a pretrained model configuration. + + The configuration class to instantiate is selected based on the `model_type` property of the config object that + is loaded, or when it's missing, by falling back to using pattern matching on `pretrained_model_name_or_path`: + + List options + + Args: + pretrained_model_name_or_path (`str` or `os.PathLike`): + Can be either: + + - A string, the *model id* of a pretrained model configuration hosted inside a model repo on + huggingface.co. Valid model ids can be located at the root-level, like `bert-base-uncased`, or + namespaced under a user or organization name, like `dbmdz/bert-base-german-cased`. + - A path to a *directory* containing a configuration file saved using the + [`~PretrainedConfig.save_pretrained`] method, or the [`~PreTrainedModel.save_pretrained`] method, + e.g., `./my_model_directory/`. + - A path or url to a saved configuration JSON *file*, e.g., + `./my_model_directory/configuration.json`. + cache_dir (`str` or `os.PathLike`, *optional*): + Path to a directory in which a downloaded pretrained model configuration should be cached if the + standard cache should not be used. + force_download (`bool`, *optional*, defaults to `False`): + Whether or not to force the (re-)download the model weights and configuration files and override the + cached versions if they exist. + resume_download (`bool`, *optional*, defaults to `False`): + Whether or not to delete incompletely received files. Will attempt to resume the download if such a + file exists. + proxies (`Dict[str, str]`, *optional*): + A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128', + 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request. + revision(`str`, *optional*, defaults to `"main"`): + The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a + git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any + identifier allowed by git. + return_unused_kwargs (`bool`, *optional*, defaults to `False`): + If `False`, then this function returns just the final configuration object. + + If `True`, then this functions returns a `Tuple(config, unused_kwargs)` where *unused_kwargs* is a + dictionary consisting of the key/value pairs whose keys are not configuration attributes: i.e., the + part of `kwargs` which has not been used to update `config` and is otherwise ignored. + trust_remote_code (`bool`, *optional*, defaults to `False`): + Whether or not to allow for custom models defined on the Hub in their own modeling files. This option + should only be set to `True` for repositories you trust and in which you have read the code, as it will + execute code present on the Hub on your local machine. 
+            kwargs(additional keyword arguments, *optional*):
+                The values in kwargs of any keys which are configuration attributes will be used to override the loaded
+                values. Behavior concerning key/value pairs whose keys are *not* configuration attributes is controlled
+                by the `return_unused_kwargs` keyword parameter.
+
+        Examples:
+
+        ```python
+        >>> from transformers import AutoConfig
+
+        >>> # Download configuration from huggingface.co and cache.
+        >>> config = AutoConfig.from_pretrained("bert-base-uncased")
+
+        >>> # Download configuration from huggingface.co (user-uploaded) and cache.
+        >>> config = AutoConfig.from_pretrained("dbmdz/bert-base-german-cased")
+
+        >>> # If configuration file is in a directory (e.g., was saved using *save_pretrained('./test/saved_model/')*).
+        >>> config = AutoConfig.from_pretrained("./test/bert_saved_model/")
+
+        >>> # Load a specific configuration file.
+        >>> config = AutoConfig.from_pretrained("./test/bert_saved_model/my_configuration.json")
+
+        >>> # Change some config attributes when loading a pretrained config.
+        >>> config = AutoConfig.from_pretrained("bert-base-uncased", output_attentions=True, foo=False)
+        >>> config.output_attentions
+        True
+
+        >>> config, unused_kwargs = AutoConfig.from_pretrained(
+        ...     "bert-base-uncased", output_attentions=True, foo=False, return_unused_kwargs=True
+        ... )
+        >>> config.output_attentions
+        True
+
+        >>> unused_kwargs
+        {'foo': False}
+        ```"""
+        kwargs["_from_auto"] = True
+        kwargs["name_or_path"] = pretrained_model_name_or_path
+        trust_remote_code = kwargs.pop("trust_remote_code", False)
+        config_dict, _ = PretrainedConfig.get_config_dict(pretrained_model_name_or_path, **kwargs)
+        if "auto_map" in config_dict and "AutoConfig" in config_dict["auto_map"]:
+            if not trust_remote_code:
+                raise ValueError(
+                    f"Loading {pretrained_model_name_or_path} requires you to execute the configuration file in that repo "
+                    "on your local machine. Make sure you have read the code there to avoid malicious use, then set "
+                    "the option `trust_remote_code=True` to remove this error."
+                )
+            if kwargs.get("revision", None) is None:
+                logger.warning(
+                    "Explicitly passing a `revision` is encouraged when loading a configuration with custom code to "
+                    "ensure no malicious code has been contributed in a newer revision."
+                )
+            class_ref = config_dict["auto_map"]["AutoConfig"]
+            module_file, class_name = class_ref.split(".")
+            config_class = get_class_from_dynamic_module(
+                pretrained_model_name_or_path, module_file + ".py", class_name, **kwargs
+            )
+            return config_class.from_pretrained(pretrained_model_name_or_path, **kwargs)
+        elif "model_type" in config_dict:
+            config_class = CONFIG_MAPPING[config_dict["model_type"]]
+            return config_class.from_dict(config_dict, **kwargs)
+        else:
+            # Fallback: use pattern matching on the string.
+            for pattern, config_class in CONFIG_MAPPING.items():
+                if pattern in str(pretrained_model_name_or_path):
+                    return config_class.from_dict(config_dict, **kwargs)
+
+        raise ValueError(
+            f"Unrecognized model in {pretrained_model_name_or_path}. "
+            f"Should have a `model_type` key in its {CONFIG_NAME}, or contain one of the following strings "
+            f"in its name: {', '.join(CONFIG_MAPPING.keys())}"
+        )
+
+    @staticmethod
+    def register(model_type, config):
+        """
+        Register a new configuration for this class.
+
+        Args:
+            model_type (`str`): The model type like "bert" or "gpt".
+            config ([`PretrainedConfig`]): The config to register.
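+
+        Example (an illustrative sketch only; `MyConfig` is a hypothetical `PretrainedConfig` subclass, not part of this repo):
+
+        ```python
+        >>> from transformers import PretrainedConfig
+
+        >>> class MyConfig(PretrainedConfig):
+        ...     model_type = "my-model"
+
+        >>> AutoConfig.register("my-model", MyConfig)
+        >>> # "my-model" now resolves through CONFIG_MAPPING, e.g. AutoConfig.for_model("my-model") returns a MyConfig instance.
+        ```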
+ """ + if issubclass(config, PretrainedConfig) and config.model_type != model_type: + raise ValueError( + "The config you are passing has a `model_type` attribute that is not consistent with the model type " + f"you passed (config has {config.model_type} and you passed {model_type}. Fix one of those so they " + "match!" + ) + CONFIG_MAPPING.register(model_type, config) diff --git a/fengshen/models/auto/dynamic.py b/fengshen/models/auto/dynamic.py new file mode 100644 index 0000000000000000000000000000000000000000..5760f6e9292195674d7096996cf3cc0ac35aa0c4 --- /dev/null +++ b/fengshen/models/auto/dynamic.py @@ -0,0 +1,235 @@ +# coding=utf-8 +# Copyright 2021 The IDEA Authors. All rights reserved. + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at + +# http://www.apache.org/licenses/LICENSE-2.0 + +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Utilities to dynamically load model and tokenizer from the Hub.""" + +import importlib +import os +import re +import shutil +import sys +from pathlib import Path +from typing import Dict, Optional, Union + +from transformers.file_utils import ( + HF_MODULES_CACHE, + TRANSFORMERS_DYNAMIC_MODULE_NAME, + cached_path, + hf_bucket_url, + is_offline_mode, +) +from transformers.utils import logging + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + + +def init_hf_modules(): + """ + Creates the cache directory for modules with an init, and adds it to the Python path. + """ + # This function has already been executed if HF_MODULES_CACHE already is in the Python path. + if HF_MODULES_CACHE in sys.path: + return + + sys.path.append(HF_MODULES_CACHE) + os.makedirs(HF_MODULES_CACHE, exist_ok=True) + init_path = Path(HF_MODULES_CACHE) / "__init__.py" + if not init_path.exists(): + init_path.touch() + + +def create_dynamic_module(name: Union[str, os.PathLike]): + """ + Creates a dynamic module in the cache directory for modules. + """ + init_hf_modules() + dynamic_module_path = Path(HF_MODULES_CACHE) / name + # If the parent module does not exist yet, recursively create it. + if not dynamic_module_path.parent.exists(): + create_dynamic_module(dynamic_module_path.parent) + os.makedirs(dynamic_module_path, exist_ok=True) + init_path = dynamic_module_path / "__init__.py" + if not init_path.exists(): + init_path.touch() + + +def check_imports(filename): + """ + Check if the current Python environment contains all the libraries that are imported in a file. 
+ """ + with open(filename, "r", encoding="utf-8") as f: + content = f.read() + + # Imports of the form `import xxx` + imports = re.findall("^\s*import\s+(\S+)\s*$", content, flags=re.MULTILINE) + # Imports of the form `from xxx import yyy` + imports += re.findall("^\s*from\s+(\S+)\s+import", content, flags=re.MULTILINE) + # Only keep the top-level module + imports = [imp.split(".")[0] for imp in imports if not imp.startswith(".")] + + # Unique-ify and test we got them all + imports = list(set(imports)) + missing_packages = [] + for imp in imports: + try: + importlib.import_module(imp) + except ImportError: + missing_packages.append(imp) + + if len(missing_packages) > 0: + raise ImportError( + "This modeling file requires the following packages that were not found in your environment: " + f"{', '.join(missing_packages)}. Run `pip install {' '.join(missing_packages)}`" + ) + + +def get_class_in_module(class_name, module_path): + """ + Import a module on the cache directory for modules and extract a class from it. + """ + module_path = module_path.replace(os.path.sep, ".") + module = importlib.import_module(module_path) + return getattr(module, class_name) + + +def get_class_from_dynamic_module( + pretrained_model_name_or_path: Union[str, os.PathLike], + module_file: str, + class_name: str, + cache_dir: Optional[Union[str, os.PathLike]] = None, + force_download: bool = False, + resume_download: bool = False, + proxies: Optional[Dict[str, str]] = None, + use_auth_token: Optional[Union[bool, str]] = None, + revision: Optional[str] = None, + local_files_only: bool = False, + **kwargs, +): + """ + Extracts a class from a module file, present in the local folder or repository of a model. + + + + Calling this function will execute the code in the module file found locally or downloaded from the Hub. It should + therefore only be called on trusted repos. + + + + Args: + pretrained_model_name_or_path (`str` or `os.PathLike`): + This can be either: + + - a string, the *model id* of a pretrained model configuration hosted inside a model repo on + huggingface.co. Valid model ids can be located at the root-level, like `bert-base-uncased`, or namespaced + under a user or organization name, like `dbmdz/bert-base-german-cased`. + - a path to a *directory* containing a configuration file saved using the + [`~PreTrainedTokenizer.save_pretrained`] method, e.g., `./my_model_directory/`. + + module_file (`str`): + The name of the module file containing the class to look for. + class_name (`str`): + The name of the class to import in the module. + cache_dir (`str` or `os.PathLike`, *optional*): + Path to a directory in which a downloaded pretrained model configuration should be cached if the standard + cache should not be used. + force_download (`bool`, *optional*, defaults to `False`): + Whether or not to force to (re-)download the configuration files and override the cached versions if they + exist. + resume_download (`bool`, *optional*, defaults to `False`): + Whether or not to delete incompletely received file. Attempts to resume the download if such a file exists. + proxies (`Dict[str, str]`, *optional*): + A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128', + 'http://hostname': 'foo.bar:4012'}.` The proxies are used on each request. + use_auth_token (`str` or *bool*, *optional*): + The token to use as HTTP bearer authorization for remote files. If `True`, will use the token generated + when running `transformers-cli login` (stored in `~/.huggingface`). 
+ revision(`str`, *optional*, defaults to `"main"`): + The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a + git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any + identifier allowed by git. + local_files_only (`bool`, *optional*, defaults to `False`): + If `True`, will only try to load the tokenizer configuration from local files. + + + + Passing `use_auth_token=True` is required when you want to use a private model. + + + + Returns: + `type`: The class, dynamically imported from the module. + + Examples: + + ```python + # Download module *modeling.py* from huggingface.co and cache then extract the class *MyBertModel* from this + # module. + cls = get_class_from_dynamic_module("sgugger/my-bert-model", "modeling.py", "MyBertModel") + ```""" + if is_offline_mode() and not local_files_only: + logger.info("Offline mode: forcing local_files_only=True") + local_files_only = True + + # Download and cache module_file from the repo `pretrained_model_name_or_path` of grab it if it's a local file. + pretrained_model_name_or_path = str(pretrained_model_name_or_path) + if os.path.isdir(pretrained_model_name_or_path): + module_file_or_url = os.path.join(pretrained_model_name_or_path, module_file) + submodule = "local" + else: + module_file_or_url = hf_bucket_url( + pretrained_model_name_or_path, filename=module_file, revision=revision, mirror=None + ) + submodule = pretrained_model_name_or_path.replace("/", os.path.sep) + + try: + # Load from URL or cache if already cached + resolved_module_file = cached_path( + module_file_or_url, + cache_dir=cache_dir, + force_download=force_download, + proxies=proxies, + resume_download=resume_download, + local_files_only=local_files_only, + use_auth_token=use_auth_token, + ) + + except EnvironmentError: + logger.error(f"Could not locate the {module_file} inside {pretrained_model_name_or_path}.") + raise + + # Check we have all the requirements in our environment + check_imports(resolved_module_file) + + # Now we move the module inside our cached dynamic modules. + full_submodule = TRANSFORMERS_DYNAMIC_MODULE_NAME + os.path.sep + submodule + create_dynamic_module(full_submodule) + submodule_path = Path(HF_MODULES_CACHE) / full_submodule + if submodule == "local": + # We always copy local files (we could hash the file to see if there was a change, and give them the name of + # that hash, to only copy when there is a modification but it seems overkill for now). + # The only reason we do the copy is to avoid putting too many folders in sys.path. + module_name = module_file + shutil.copy(resolved_module_file, submodule_path / module_file) + else: + # The module file will end up being named module_file + the etag. This way we get the benefit of versioning. 
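+        # Illustrative (hypothetical file names): module_file "modeling.py" resolved to a cached file named
+        # "<url_hash>.<etag_hash>" becomes module_name "modeling_<url_hash>_<etag_hash>.py", so a changed etag
+        # yields a fresh module name instead of shadowing the previously cached version.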
+ resolved_module_file_name = Path(resolved_module_file).name + module_name_parts = [module_file.replace(".py", "")] + resolved_module_file_name.split(".") + module_name = "_".join(module_name_parts) + ".py" + if not (submodule_path / module_name).exists(): + shutil.copy(resolved_module_file, submodule_path / module_name) + + # And lastly we get the class inside our newly created module + final_module = os.path.join(full_submodule, module_name.replace(".py", "")) + return get_class_in_module(class_name, final_module) diff --git a/fengshen/models/auto/modeling_auto.py b/fengshen/models/auto/modeling_auto.py new file mode 100644 index 0000000000000000000000000000000000000000..3805e86d239d63d826092fa811261b2334e608f7 --- /dev/null +++ b/fengshen/models/auto/modeling_auto.py @@ -0,0 +1,272 @@ +# coding=utf-8 +# Copyright 2018 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" Auto Model class.""" + +import warnings +from collections import OrderedDict + +from transformers.utils import logging +from .auto_factory import _BaseAutoModelClass, _LazyAutoMapping, auto_class_update +from .configuration_auto import CONFIG_MAPPING_NAMES + + +logger = logging.get_logger(__name__) + + +MODEL_MAPPING_NAMES = OrderedDict( + [ + # Base model mapping + ("roformer", "RoFormerModel"), + ("longformer", "LongformerModel"), + ] +) + +MODEL_FOR_PRETRAINING_MAPPING_NAMES = OrderedDict( + [ + # Model for pre-training mapping + ("longformer", "LongformerForMaskedLM"), + ] +) + +MODEL_WITH_LM_HEAD_MAPPING_NAMES = OrderedDict( + [ + # Model with LM heads mapping + ("roformer", "RoFormerForMaskedLM"), + ("longformer", "LongformerForMaskedLM"), + ] +) + +MODEL_FOR_CAUSAL_LM_MAPPING_NAMES = OrderedDict( + [ + # Model for Causal LM mapping + ("roformer", "RoFormerForCausalLM"), + ] +) + + +MODEL_FOR_MASKED_LM_MAPPING_NAMES = OrderedDict( + [ + # Model for Masked LM mapping + ("roformer", "RoFormerForMaskedLM"), + ("longformer", "LongformerForMaskedLM"), + ] +) + + +MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES = OrderedDict( + [ + # Model for Seq2Seq Causal LM mapping + ("t5", "T5ForConditionalGeneration"), + + ] +) + +MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING_NAMES = OrderedDict( + [ + ("speech-encoder-decoder", "SpeechEncoderDecoderModel"), + ("speech_to_text", "Speech2TextForConditionalGeneration"), + ] +) + +MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES = OrderedDict( + [ + # Model for Sequence Classification mapping + ("roformer", "RoFormerForSequenceClassification"), + ("longformer", "LongformerForSequenceClassification"), + ] +) + +MODEL_FOR_QUESTION_ANSWERING_MAPPING_NAMES = OrderedDict( + [ + # Model for Question Answering mapping + ("roformer", "RoFormerForQuestionAnswering"), + ("longformer", "LongformerForQuestionAnswering"), + ] +) + +MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING_NAMES = OrderedDict( + [ + # Model for Table Question Answering mapping + ("tapas", "TapasForQuestionAnswering"), + ] +) + +MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING_NAMES = OrderedDict( + [ + # Model for Token 
Classification mapping + ("roformer", "RoFormerForTokenClassification"), + ("longformer", "LongformerForTokenClassification"), + ] +) + +MODEL_FOR_MULTIPLE_CHOICE_MAPPING_NAMES = OrderedDict( + [ + # Model for Multiple Choice mapping + ("roformer", "RoFormerForMultipleChoice"), + ("longformer", "LongformerForMultipleChoice"), + ] +) + + + + +MODEL_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, MODEL_MAPPING_NAMES) + +MODEL_FOR_PRETRAINING_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, MODEL_FOR_PRETRAINING_MAPPING_NAMES) + +MODEL_WITH_LM_HEAD_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, MODEL_WITH_LM_HEAD_MAPPING_NAMES) + +MODEL_FOR_CAUSAL_LM_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, MODEL_FOR_CAUSAL_LM_MAPPING_NAMES) + +MODEL_FOR_MASKED_LM_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, MODEL_FOR_MASKED_LM_MAPPING_NAMES) + +MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING = _LazyAutoMapping( + CONFIG_MAPPING_NAMES, MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES +) +MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING = _LazyAutoMapping( + CONFIG_MAPPING_NAMES, MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES +) +MODEL_FOR_QUESTION_ANSWERING_MAPPING = _LazyAutoMapping( + CONFIG_MAPPING_NAMES, MODEL_FOR_QUESTION_ANSWERING_MAPPING_NAMES +) +MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING = _LazyAutoMapping( + CONFIG_MAPPING_NAMES, MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING_NAMES +) +MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING = _LazyAutoMapping( + CONFIG_MAPPING_NAMES, MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING_NAMES +) +MODEL_FOR_MULTIPLE_CHOICE_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, MODEL_FOR_MULTIPLE_CHOICE_MAPPING_NAMES) + +MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING_NAMES) + + + +class AutoModel(_BaseAutoModelClass): + _model_mapping = MODEL_MAPPING + + +AutoModel = auto_class_update(AutoModel) + + +class AutoModelForPreTraining(_BaseAutoModelClass): + _model_mapping = MODEL_FOR_PRETRAINING_MAPPING + + +AutoModelForPreTraining = auto_class_update(AutoModelForPreTraining, head_doc="pretraining") + + +# Private on purpose, the public class will add the deprecation warnings. 
+class _AutoModelWithLMHead(_BaseAutoModelClass):
+    _model_mapping = MODEL_WITH_LM_HEAD_MAPPING
+
+
+_AutoModelWithLMHead = auto_class_update(_AutoModelWithLMHead, head_doc="language modeling")
+
+
+class AutoModelForCausalLM(_BaseAutoModelClass):
+    _model_mapping = MODEL_FOR_CAUSAL_LM_MAPPING
+
+
+AutoModelForCausalLM = auto_class_update(AutoModelForCausalLM, head_doc="causal language modeling")
+
+
+class AutoModelForMaskedLM(_BaseAutoModelClass):
+    _model_mapping = MODEL_FOR_MASKED_LM_MAPPING
+
+
+AutoModelForMaskedLM = auto_class_update(AutoModelForMaskedLM, head_doc="masked language modeling")
+
+
+class AutoModelForSeq2SeqLM(_BaseAutoModelClass):
+    _model_mapping = MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING
+
+
+AutoModelForSeq2SeqLM = auto_class_update(
+    AutoModelForSeq2SeqLM, head_doc="sequence-to-sequence language modeling", checkpoint_for_example="t5-base"
+)
+
+
+class AutoModelForSequenceClassification(_BaseAutoModelClass):
+    _model_mapping = MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING
+
+
+AutoModelForSequenceClassification = auto_class_update(
+    AutoModelForSequenceClassification, head_doc="sequence classification"
+)
+
+
+class AutoModelForQuestionAnswering(_BaseAutoModelClass):
+    _model_mapping = MODEL_FOR_QUESTION_ANSWERING_MAPPING
+
+
+AutoModelForQuestionAnswering = auto_class_update(AutoModelForQuestionAnswering, head_doc="question answering")
+
+
+class AutoModelForTableQuestionAnswering(_BaseAutoModelClass):
+    _model_mapping = MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING
+
+
+AutoModelForTableQuestionAnswering = auto_class_update(
+    AutoModelForTableQuestionAnswering,
+    head_doc="table question answering",
+    checkpoint_for_example="google/tapas-base-finetuned-wtq",
+)
+
+
+class AutoModelForTokenClassification(_BaseAutoModelClass):
+    _model_mapping = MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING
+
+
+AutoModelForTokenClassification = auto_class_update(AutoModelForTokenClassification, head_doc="token classification")
+
+
+class AutoModelForMultipleChoice(_BaseAutoModelClass):
+    _model_mapping = MODEL_FOR_MULTIPLE_CHOICE_MAPPING
+
+
+AutoModelForMultipleChoice = auto_class_update(AutoModelForMultipleChoice, head_doc="multiple choice")
+
+
+class AutoModelForSpeechSeq2Seq(_BaseAutoModelClass):
+    _model_mapping = MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING
+
+
+AutoModelForSpeechSeq2Seq = auto_class_update(
+    AutoModelForSpeechSeq2Seq, head_doc="sequence-to-sequence speech-to-text modeling"
+)
+
+
+class AutoModelWithLMHead(_AutoModelWithLMHead):
+    @classmethod
+    def from_config(cls, config):
+        warnings.warn(
+            "The class `AutoModelWithLMHead` is deprecated and will be removed in a future version. Please use "
+            "`AutoModelForCausalLM` for causal language models, `AutoModelForMaskedLM` for masked language models and "
+            "`AutoModelForSeq2SeqLM` for encoder-decoder models.",
+            FutureWarning,
+        )
+        return super().from_config(config)
+
+    @classmethod
+    def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
+        warnings.warn(
+            "The class `AutoModelWithLMHead` is deprecated and will be removed in a future version.
Please use " + "`AutoModelForCausalLM` for causal language models, `AutoModelForMaskedLM` for masked language models and " + "`AutoModelForSeq2SeqLM` for encoder-decoder models.", + FutureWarning, + ) + return super().from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) diff --git a/fengshen/models/auto/tokenization_auto.py b/fengshen/models/auto/tokenization_auto.py new file mode 100644 index 0000000000000000000000000000000000000000..6555191bef55336708cabc5e9b17c0322318a417 --- /dev/null +++ b/fengshen/models/auto/tokenization_auto.py @@ -0,0 +1,449 @@ +# coding=utf-8 +# Copyright 2021 The IDEA Authors. All rights reserved. + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at + +# http://www.apache.org/licenses/LICENSE-2.0 + +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" Auto Tokenizer class.""" + +import importlib +import json +import os +from collections import OrderedDict +from pathlib import Path +from typing import TYPE_CHECKING, Dict, Optional, Tuple, Union + +from transformers.configuration_utils import PretrainedConfig +from transformers.file_utils import ( + cached_path, + get_list_of_files, + hf_bucket_url, + is_offline_mode, + is_sentencepiece_available, + is_tokenizers_available, +) +from transformers.tokenization_utils import PreTrainedTokenizer +from transformers.tokenization_utils_base import TOKENIZER_CONFIG_FILE +from transformers.tokenization_utils_fast import PreTrainedTokenizerFast +from transformers.utils import logging +# from ..encoder_decoder import EncoderDecoderConfig +from .auto_factory import _LazyAutoMapping +from .configuration_auto import ( + CONFIG_MAPPING_NAMES, + AutoConfig, + config_class_to_model_type, + model_type_to_module_name, + replace_list_option_in_docstrings, +) +from .dynamic import get_class_from_dynamic_module + + +logger = logging.get_logger(__name__) + +if TYPE_CHECKING: + # This significantly improves completion suggestion performance when + # the transformers package is used with Microsoft's Pylance language server. 
+ TOKENIZER_MAPPING_NAMES: OrderedDict[str, + Tuple[Optional[str], Optional[str]]] = OrderedDict() +else: + TOKENIZER_MAPPING_NAMES = OrderedDict( + [ + ("roformer", ("RoFormerTokenizer", None)), + ("longformer", ("LongformerTokenizer", None)), + ] + ) + +TOKENIZER_MAPPING = _LazyAutoMapping( + CONFIG_MAPPING_NAMES, TOKENIZER_MAPPING_NAMES) + +CONFIG_TO_TYPE = {v: k for k, v in CONFIG_MAPPING_NAMES.items()} + + +def tokenizer_class_from_name(class_name: str): + if class_name == "PreTrainedTokenizerFast": + return PreTrainedTokenizerFast + + for module_name, tokenizers in TOKENIZER_MAPPING_NAMES.items(): + if class_name in tokenizers: + module_name = model_type_to_module_name(module_name) + + module = importlib.import_module( + f".{module_name}", "transformers.models") + return getattr(module, class_name) + + for config, tokenizers in TOKENIZER_MAPPING._extra_content.items(): + for tokenizer in tokenizers: + if getattr(tokenizer, "__name__", None) == class_name: + return tokenizer + + return None + + +def get_tokenizer_config( + pretrained_model_name_or_path: Union[str, os.PathLike], + cache_dir: Optional[Union[str, os.PathLike]] = None, + force_download: bool = False, + resume_download: bool = False, + proxies: Optional[Dict[str, str]] = None, + use_auth_token: Optional[Union[bool, str]] = None, + revision: Optional[str] = None, + local_files_only: bool = False, + **kwargs, +): + """ + Loads the tokenizer configuration from a pretrained model tokenizer configuration. + + Args: + pretrained_model_name_or_path (`str` or `os.PathLike`): + This can be either: + + - a string, the *model id* of a pretrained model configuration hosted inside a model repo on + huggingface.co. Valid model ids can be located at the root-level, like `bert-base-uncased`, or namespaced + under a user or organization name, like `dbmdz/bert-base-german-cased`. + - a path to a *directory* containing a configuration file saved using the + [`~PreTrainedTokenizer.save_pretrained`] method, e.g., `./my_model_directory/`. + + cache_dir (`str` or `os.PathLike`, *optional*): + Path to a directory in which a downloaded pretrained model configuration should be cached if the standard + cache should not be used. + force_download (`bool`, *optional*, defaults to `False`): + Whether or not to force to (re-)download the configuration files and override the cached versions if they + exist. + resume_download (`bool`, *optional*, defaults to `False`): + Whether or not to delete incompletely received file. Attempts to resume the download if such a file exists. + proxies (`Dict[str, str]`, *optional*): + A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128', + 'http://hostname': 'foo.bar:4012'}.` The proxies are used on each request. + use_auth_token (`str` or *bool*, *optional*): + The token to use as HTTP bearer authorization for remote files. If `True`, will use the token generated + when running `transformers-cli login` (stored in `~/.huggingface`). + revision(`str`, *optional*, defaults to `"main"`): + The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a + git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any + identifier allowed by git. + local_files_only (`bool`, *optional*, defaults to `False`): + If `True`, will only try to load the tokenizer configuration from local files. + + + + Passing `use_auth_token=True` is required when you want to use a private model. 
+ + + + Returns: + `Dict`: The configuration of the tokenizer. + + Examples: + + ```python + # Download configuration from huggingface.co and cache. + tokenizer_config = get_tokenizer_config("bert-base-uncased") + # This model does not have a tokenizer config so the result will be an empty dict. + tokenizer_config = get_tokenizer_config("xlm-roberta-base") + + # Save a pretrained tokenizer locally and you can reload its config + from transformers import AutoTokenizer + + tokenizer = AutoTokenizer.from_pretrained("bert-base-cased") + tokenizer.save_pretrained("tokenizer-test") + tokenizer_config = get_tokenizer_config("tokenizer-test") + ```""" + if is_offline_mode() and not local_files_only: + logger.info("Offline mode: forcing local_files_only=True") + local_files_only = True + + # Will raise a ValueError if `pretrained_model_name_or_path` is not a valid path or model identifier + repo_files = get_list_of_files( + pretrained_model_name_or_path, + revision=revision, + use_auth_token=use_auth_token, + local_files_only=local_files_only, + ) + if TOKENIZER_CONFIG_FILE not in [Path(f).name for f in repo_files]: + return {} + + pretrained_model_name_or_path = str(pretrained_model_name_or_path) + if os.path.isdir(pretrained_model_name_or_path): + config_file = os.path.join( + pretrained_model_name_or_path, TOKENIZER_CONFIG_FILE) + else: + config_file = hf_bucket_url( + pretrained_model_name_or_path, filename=TOKENIZER_CONFIG_FILE, revision=revision, mirror=None + ) + + try: + # Load from URL or cache if already cached + resolved_config_file = cached_path( + config_file, + cache_dir=cache_dir, + force_download=force_download, + proxies=proxies, + resume_download=resume_download, + local_files_only=local_files_only, + use_auth_token=use_auth_token, + ) + + except EnvironmentError: + logger.info( + "Could not locate the tokenizer configuration file, will try to use the model config instead.") + return {} + + with open(resolved_config_file, encoding="utf-8") as reader: + return json.load(reader) + + +class AutoTokenizer: + r""" + This is a generic tokenizer class that will be instantiated as one of the tokenizer classes of the library when + created with the [`AutoTokenizer.from_pretrained`] class method. + + This class cannot be instantiated directly using `__init__()` (throws an error). + """ + + def __init__(self): + raise EnvironmentError( + "AutoTokenizer is designed to be instantiated " + "using the `AutoTokenizer.from_pretrained(pretrained_model_name_or_path)` method." + ) + + @classmethod + @replace_list_option_in_docstrings(TOKENIZER_MAPPING_NAMES) + def from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs): + r""" + Instantiate one of the tokenizer classes of the library from a pretrained model vocabulary. + + The tokenizer class to instantiate is selected based on the `model_type` property of the config object (either + passed as an argument or loaded from `pretrained_model_name_or_path` if possible), or when it's missing, by + falling back to using pattern matching on `pretrained_model_name_or_path`: + + List options + + Params: + pretrained_model_name_or_path (`str` or `os.PathLike`): + Can be either: + + - A string, the *model id* of a predefined tokenizer hosted inside a model repo on huggingface.co. + Valid model ids can be located at the root-level, like `bert-base-uncased`, or namespaced under a + user or organization name, like `dbmdz/bert-base-german-cased`. 
+ - A path to a *directory* containing vocabulary files required by the tokenizer, for instance saved + using the [`~PreTrainedTokenizer.save_pretrained`] method, e.g., `./my_model_directory/`. + - A path or url to a single saved vocabulary file if and only if the tokenizer only requires a + single vocabulary file (like Bert or XLNet), e.g.: `./my_model_directory/vocab.txt`. (Not + applicable to all derived classes) + inputs (additional positional arguments, *optional*): + Will be passed along to the Tokenizer `__init__()` method. + config ([`PretrainedConfig`], *optional*) + The configuration object used to dertermine the tokenizer class to instantiate. + cache_dir (`str` or `os.PathLike`, *optional*): + Path to a directory in which a downloaded pretrained model configuration should be cached if the + standard cache should not be used. + force_download (`bool`, *optional*, defaults to `False`): + Whether or not to force the (re-)download the model weights and configuration files and override the + cached versions if they exist. + resume_download (`bool`, *optional*, defaults to `False`): + Whether or not to delete incompletely received files. Will attempt to resume the download if such a + file exists. + proxies (`Dict[str, str]`, *optional*): + A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128', + 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request. + revision(`str`, *optional*, defaults to `"main"`): + The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a + git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any + identifier allowed by git. + subfolder (`str`, *optional*): + In case the relevant files are located inside a subfolder of the model repo on huggingface.co (e.g. for + facebook/rag-token-base), specify it here. + use_fast (`bool`, *optional*, defaults to `True`): + Whether or not to try to load the fast version of the tokenizer. + tokenizer_type (`str`, *optional*): + Tokenizer type to be loaded. + trust_remote_code (`bool`, *optional*, defaults to `False`): + Whether or not to allow for custom models defined on the Hub in their own modeling files. This option + should only be set to `True` for repositories you trust and in which you have read the code, as it will + execute code present on the Hub on your local machine. + kwargs (additional keyword arguments, *optional*): + Will be passed to the Tokenizer `__init__()` method. Can be used to set special tokens like + `bos_token`, `eos_token`, `unk_token`, `sep_token`, `pad_token`, `cls_token`, `mask_token`, + `additional_special_tokens`. See parameters in the `__init__()` for more details. + + Examples: + + ```python + >>> from transformers import AutoTokenizer + + >>> # Download vocabulary from huggingface.co and cache. + >>> tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased") + + >>> # Download vocabulary from huggingface.co (user-uploaded) and cache. + >>> tokenizer = AutoTokenizer.from_pretrained("dbmdz/bert-base-german-cased") + + >>> # If vocabulary files are in a directory (e.g. 
tokenizer was saved using *save_pretrained('./test/saved_model/')*) + >>> tokenizer = AutoTokenizer.from_pretrained("./test/bert_saved_model/") + ```""" + config = kwargs.pop("config", None) + kwargs["_from_auto"] = True + + use_fast = kwargs.pop("use_fast", True) + tokenizer_type = kwargs.pop("tokenizer_type", None) + trust_remote_code = kwargs.pop("trust_remote_code", False) + + # First, let's see whether the tokenizer_type is passed so that we can leverage it + if tokenizer_type is not None: + tokenizer_class = None + tokenizer_class_tuple = TOKENIZER_MAPPING_NAMES.get( + tokenizer_type, None) + + if tokenizer_class_tuple is None: + raise ValueError( + f"Passed `tokenizer_type` {tokenizer_type} does not exist. `tokenizer_type` should be one of " + f"{', '.join(c for c in TOKENIZER_MAPPING_NAMES.keys())}." + ) + + tokenizer_class_name, tokenizer_fast_class_name = tokenizer_class_tuple + + if use_fast and tokenizer_fast_class_name is not None: + tokenizer_class = tokenizer_class_from_name( + tokenizer_fast_class_name) + + if tokenizer_class is None: + tokenizer_class = tokenizer_class_from_name( + tokenizer_class_name) + + if tokenizer_class is None: + raise ValueError( + f"Tokenizer class {tokenizer_class_name} is not currently imported.") + + return tokenizer_class.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs) + + # Next, let's try to use the tokenizer_config file to get the tokenizer class. + tokenizer_config = get_tokenizer_config( + pretrained_model_name_or_path, **kwargs) + + config_tokenizer_class = tokenizer_config.get("tokenizer_class") + tokenizer_auto_map = tokenizer_config.get("auto_map") + + # If that did not work, let's try to use the config. + if config_tokenizer_class is None: + if not isinstance(config, PretrainedConfig): + config = AutoConfig.from_pretrained( + pretrained_model_name_or_path, trust_remote_code=trust_remote_code, **kwargs + ) + config_tokenizer_class = config.tokenizer_class + if hasattr(config, "auto_map") and "AutoTokenizer" in config.auto_map: + tokenizer_auto_map = config.auto_map["AutoTokenizer"] + + # If we have the tokenizer class from the tokenizer config or the model config we're good! + if config_tokenizer_class is not None: + tokenizer_class = None + if tokenizer_auto_map is not None: + if not trust_remote_code: + raise ValueError( + f"Loading {pretrained_model_name_or_path} requires you to execute the tokenizer file in that repo " + "on your local machine. Make sure you have read the code there to avoid malicious use, then set " + "the option `trust_remote_code=True` to remove this error." + ) + if kwargs.get("revision", None) is None: + logger.warn( + "Explicitly passing a `revision` is encouraged when loading a model with custom code to ensure " + "no malicious code has been contributed in a newer revision." 
+                    )
+
+                if use_fast and tokenizer_auto_map[1] is not None:
+                    class_ref = tokenizer_auto_map[1]
+                else:
+                    class_ref = tokenizer_auto_map[0]
+
+                module_file, class_name = class_ref.split(".")
+                tokenizer_class = get_class_from_dynamic_module(
+                    pretrained_model_name_or_path, module_file + ".py", class_name, **kwargs
+                )
+
+            elif use_fast and not config_tokenizer_class.endswith("Fast"):
+                tokenizer_class_candidate = f"{config_tokenizer_class}Fast"
+                tokenizer_class = tokenizer_class_from_name(tokenizer_class_candidate)
+            if tokenizer_class is None:
+                tokenizer_class_candidate = config_tokenizer_class
+                tokenizer_class = tokenizer_class_from_name(tokenizer_class_candidate)
+
+            if tokenizer_class is None:
+                raise ValueError(
+                    f"Tokenizer class {tokenizer_class_candidate} does not exist or is not currently imported."
+                )
+            return tokenizer_class.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
+
+        model_type = config_class_to_model_type(type(config).__name__)
+        if model_type is not None:
+            tokenizer_class_py, tokenizer_class_fast = TOKENIZER_MAPPING[type(config)]
+            if tokenizer_class_fast and (use_fast or tokenizer_class_py is None):
+                return tokenizer_class_fast.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
+            else:
+                if tokenizer_class_py is not None:
+                    return tokenizer_class_py.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
+                else:
+                    raise ValueError(
+                        "This tokenizer cannot be instantiated. Please make sure you have `sentencepiece` installed "
+                        "in order to use this tokenizer."
+                    )
+
+        raise ValueError(
+            f"Unrecognized configuration class {config.__class__} to build an AutoTokenizer.\n"
+            f"Model type should be one of {', '.join(c.__name__ for c in TOKENIZER_MAPPING.keys())}."
+        )
+
+    def register(config_class, slow_tokenizer_class=None, fast_tokenizer_class=None):
+        """
+        Register a new tokenizer in this mapping.
+
+        Args:
+            config_class ([`PretrainedConfig`]):
+                The configuration corresponding to the model to register.
+            slow_tokenizer_class ([`PreTrainedTokenizer`], *optional*):
+                The slow tokenizer to register.
+            fast_tokenizer_class ([`PreTrainedTokenizerFast`], *optional*):
+                The fast tokenizer to register.
+        """
+        if slow_tokenizer_class is None and fast_tokenizer_class is None:
+            raise ValueError("You need to pass either a `slow_tokenizer_class` or a `fast_tokenizer_class`")
+        if slow_tokenizer_class is not None and issubclass(slow_tokenizer_class, PreTrainedTokenizerFast):
+            raise ValueError("You passed a fast tokenizer in the `slow_tokenizer_class`.")
+        if fast_tokenizer_class is not None and issubclass(fast_tokenizer_class, PreTrainedTokenizer):
+            raise ValueError("You passed a slow tokenizer in the `fast_tokenizer_class`.")
+
+        if (
+            slow_tokenizer_class is not None
+            and fast_tokenizer_class is not None
+            and issubclass(fast_tokenizer_class, PreTrainedTokenizerFast)
+            and fast_tokenizer_class.slow_tokenizer_class != slow_tokenizer_class
+        ):
+            raise ValueError(
+                "The fast tokenizer class you are passing has a `slow_tokenizer_class` attribute that is not "
+                "consistent with the slow tokenizer class you passed (fast tokenizer has "
+                f"{fast_tokenizer_class.slow_tokenizer_class} and you passed {slow_tokenizer_class}. Fix one of those "
+                "so they match!"
+            )
+
+        # Avoid resetting a set slow/fast tokenizer if we are passing just the other ones.
+ if config_class in TOKENIZER_MAPPING._extra_content: + existing_slow, existing_fast = TOKENIZER_MAPPING[config_class] + if slow_tokenizer_class is None: + slow_tokenizer_class = existing_slow + if fast_tokenizer_class is None: + fast_tokenizer_class = existing_fast + + TOKENIZER_MAPPING.register( + config_class, (slow_tokenizer_class, fast_tokenizer_class)) diff --git a/fengshen/models/bart/modeling_bart.py b/fengshen/models/bart/modeling_bart.py new file mode 100644 index 0000000000000000000000000000000000000000..f9a58ac8036fbc0bb9334b083b12a5599950d355 --- /dev/null +++ b/fengshen/models/bart/modeling_bart.py @@ -0,0 +1,423 @@ +import warnings +from pytorch_lightning import LightningModule +from fengshen.models import transformer_utils + +import torch +import torch.utils.checkpoint +from torch import nn +import torch.nn.functional as F + +from dataclasses import dataclass +from typing import Optional, Tuple + +from transformers.file_utils import * +from transformers.modeling_outputs import * +from transformers.models.bart import * +from transformers.models.bart.modeling_bart import BartClassificationHead + + +_CONFIG_FOR_DOC = "BartConfig" + + +# ------------------------ ZZ: CBart addition ------------------------ + + +def _reorder_buffer(attn_cache, new_order): + for k, input_buffer_k in attn_cache.items(): + if input_buffer_k is not None: + attn_cache[k] = input_buffer_k.index_select(0, new_order) + return attn_cache + + +def _make_linear_from_emb(emb): + vocab_size, emb_size = emb.weight.shape + lin_layer = nn.Linear(vocab_size, emb_size, bias=False) + lin_layer.weight.data = emb.weight.data + return lin_layer + + +BART_GENERATION_EXAMPLE = r""" + Summarization example:: + + >>> from transformers import BartTokenizer, BartForConditionalGeneration, BartConfig + + >>> model = BartForConditionalGeneration.from_pretrained('facebook/bart-large-cnn') + >>> tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn') + + >>> ARTICLE_TO_SUMMARIZE = "My friends are cool but they eat too many carbs." + >>> inputs = tokenizer([ARTICLE_TO_SUMMARIZE], max_length=1024, return_tensors='pt') + + >>> # Generate Summary + >>> summary_ids = model.generate(inputs['input_ids'], num_beams=4, max_length=5, early_stopping=True) + >>> print([tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=False) for g in summary_ids]) + + Mask filling example:: + + >>> from transformers import BartTokenizer, BartForConditionalGeneration + >>> tokenizer = BartTokenizer.from_pretrained('facebook/bart-large') + >>> TXT = "My friends are but they eat too many carbs." + + >>> model = BartForConditionalGeneration.from_pretrained('facebook/bart-large') + >>> input_ids = tokenizer([TXT], return_tensors='pt')['input_ids'] + >>> logits = model(input_ids).logits + + >>> masked_index = (input_ids[0] == tokenizer.mask_token_id).nonzero().item() + >>> probs = logits[0, masked_index].softmax(dim=0) + >>> values, predictions = probs.topk(5) + + >>> tokenizer.decode(predictions).split() +""" + + +@dataclass +class CBartLMOutput(ModelOutput): + """ + Base class for CBart specific language models outputs. + + Args: + .... 
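+        loss (`torch.FloatTensor`, *optional*):
+            Weighted sum `encoder_loss * loss_weight + decoder_loss`, returned when both label sets are provided.
+        encoder_loss (`torch.FloatTensor`, *optional*):
+            Token-level classification (or regression) loss of the encoder head, returned when `encoder_labels` is given.
+        decoder_loss (`torch.FloatTensor`, *optional*):
+            Masked language modeling loss of the decoder, returned when `labels` is given.
+        encoder_logits (`torch.FloatTensor`):
+            Scores produced by the classification head on top of the encoder's last hidden state.
+
+    The remaining fields mirror `transformers.modeling_outputs.Seq2SeqLMOutput`.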
+ """ + loss: Optional[torch.FloatTensor] = None + encoder_loss: Optional[torch.FloatTensor] = None + decoder_loss: Optional[torch.FloatTensor] = None + encoder_logits: torch.FloatTensor = None + logits: torch.FloatTensor = None + past_key_values: Optional[Tuple[torch.FloatTensor]] = None + decoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None + decoder_attentions: Optional[Tuple[torch.FloatTensor]] = None + encoder_last_hidden_state: Optional[torch.FloatTensor] = None + encoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None + encoder_attentions: Optional[Tuple[torch.FloatTensor]] = None + + +class BartForTextInfill(BartPretrainedModel): + """ + this class is designed for text infilling. + During training, the encoder is used to predict replace, insert, + and the decoder is used to generate original input. + Compared with BartForConditionalGeneration class, + we add a module over the encoder and add a new loss for the encoder. + """ + base_model_prefix = "model" + authorized_missing_keys = [r"final_logits_bias", + r"encoder\.version", r"decoder\.version"] + + def __init__(self, config: BartConfig): + super().__init__(config) + base_model = BartModel(config) + self.model = base_model + self.register_buffer("final_logits_bias", torch.zeros( + (1, self.model.shared.num_embeddings))) + # print( config.encoder_loss_type, config.num_labels) + + # add a new attribute into BartConfig class (revise BartConfig) + self.encoder_loss_type = config.encoder_loss_type + self.num_labels = config.num_labels + if self.encoder_loss_type == 0: # 0 is classification loss, 1 is regression loss + # add a classification module for the encoder + self.classification_head = BartClassificationHead( + config.d_model, config.d_model, config.num_labels, config.classif_dropout, + ) + else: + # add a regression module for the encoder + self.classification_head = BartClassificationHead( + config.d_model, config.d_model, 1, config.classif_dropout, + ) + + self.model._init_weights(self.classification_head.dense) + self.model._init_weights(self.classification_head.out_proj) + self.loss_weight = config.loss_weight + self.register_buffer("label_weights", torch.zeros((self.num_labels))) + + def resize_token_embeddings(self, new_num_tokens: int) -> nn.Embedding: + old_num_tokens = self.model.shared.num_embeddings + new_embeddings = super().resize_token_embeddings(new_num_tokens) + self.model.shared = new_embeddings + self._resize_final_logits_bias(new_num_tokens, old_num_tokens) + return new_embeddings + + def _resize_final_logits_bias(self, new_num_tokens: int, old_num_tokens: int) -> None: + if new_num_tokens <= old_num_tokens: + new_bias = self.final_logits_bias[:, :new_num_tokens] + else: + extra_bias = torch.zeros((1, new_num_tokens - old_num_tokens), + device=self.final_logits_bias.device) + new_bias = torch.cat([self.final_logits_bias, extra_bias], dim=1) + self.register_buffer("final_logits_bias", new_bias) + + @replace_return_docstrings(output_type=Seq2SeqLMOutput, config_class=_CONFIG_FOR_DOC) + @add_end_docstrings(BART_GENERATION_EXAMPLE) + def forward( + self, + input_ids, + attention_mask=None, + encoder_outputs=None, + decoder_input_ids=None, + decoder_attention_mask=None, + past_key_values=None, + encoder_labels=None, + labels=None, + use_cache=None, + output_attentions=None, + output_hidden_states=None, + return_dict=True, + **unused, + ): + r""" + labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): + Labels for computing the 
masked language modeling loss. + Indices should either be in ``[0, ..., config.vocab_size]`` or -100 (see ``input_ids`` docstring). + Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens + with labels in ``[0, ..., config.vocab_size]``. + + Returns: + + Conditional generation example:: + + # Mask filling only works for bart-large + from transformers import BartTokenizer, BartForConditionalGeneration + tokenizer = BartTokenizer.from_pretrained('facebook/bart-large') + TXT = "My friends are but they eat too many carbs." + + model = BartForConditionalGeneration.from_pretrained('facebook/bart-large') + input_ids = tokenizer([TXT], return_tensors='pt')['input_ids'] + logits = model(input_ids).logits + + masked_index = (input_ids[0] == tokenizer.mask_token_id).nonzero().item() + probs = logits[0, masked_index].softmax(dim=0) + values, predictions = probs.topk(5) + + tokenizer.decode(predictions).split() + # ['good', 'great', 'all', 'really', 'very'] + """ + if "lm_labels" in unused: + warnings.warn( + "The `lm_labels` argument is deprecated and will be removed in a future version, use `labels` instead.", + FutureWarning, + ) + labels = unused.pop("lm_labels") + if "decoder_cached_states" in unused: + warnings.warn( + "The `decoder_cached_states` argument is deprecated and will be removed in a future version, use `decoder_past_key_values` instead.", + FutureWarning, + ) + decoder_past_key_values = unused.pop("decoder_cached_states") + return_dict = return_dict if return_dict is not None else False + + if labels is not None: + use_cache = False + + outputs = self.model( + input_ids, + attention_mask=attention_mask, + decoder_input_ids=decoder_input_ids, + encoder_outputs=encoder_outputs, + decoder_attention_mask=decoder_attention_mask, + past_key_values=past_key_values, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + # logits and loss for the encoder + # last hidden state + encoder_last_hidden_state = outputs['encoder_last_hidden_state'] + # eos_mask = input_ids.eq(self.config.eos_token_id) + # if len(torch.unique(eos_mask.sum(1))) > 1: + # raise ValueError("All examples must have the same number of tokens.") + # sentence_representation = x[eos_mask, :].view(x.size(0), -1, x.size(-1))[:, -1, :] + encoder_logits = self.classification_head(encoder_last_hidden_state) + encoder_loss = None + if encoder_labels is not None: + # classification loss + if self.encoder_loss_type == 0: + # ZZ: seems like MSE loss does not support weighting, so only CEL has weighting applied for now + loss_fct = nn.CrossEntropyLoss(weight=self.label_weights) + encoder_loss = loss_fct( + encoder_logits.view(-1, self.config.num_labels), encoder_labels.view(-1)) + # regression loss + else: + encoder_logits = encoder_logits.view( + encoder_logits.size(0), -1) + encoder_logits = torch.sigmoid( + encoder_logits) * self.num_labels - 0.5 + loss_fct = nn.MSELoss(reduction='none') + _loss = loss_fct(encoder_logits, encoder_labels) + encoder_loss = torch.mean(_loss[encoder_labels >= 0]) + # encoder_loss =_loss[encoder_labels>=0] + + # logits and loss for the decoder + lm_logits = F.linear( + outputs[0], self.model.shared.weight, bias=self.final_logits_bias) + masked_lm_loss = None + if labels is not None: + loss_fct = nn.CrossEntropyLoss() + # TODO(SS): do we need to ignore pad tokens in labels? 
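+            # Note: nn.CrossEntropyLoss uses ignore_index=-100 by default, so label positions set to -100
+            # (e.g. padding) are already excluded from this decoder loss.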
+ masked_lm_loss = loss_fct( + lm_logits.view(-1, self.config.vocab_size), labels.view(-1)) + + loss = None + if masked_lm_loss is not None and encoder_loss is not None: + loss = encoder_loss * self.loss_weight + masked_lm_loss + + if not return_dict: + output = (lm_logits,) + outputs[1:] + return ((loss,) + output) if loss is not None else output + + return CBartLMOutput( + loss=loss, + encoder_loss=encoder_loss, + decoder_loss=masked_lm_loss, + encoder_logits=encoder_logits, + logits=lm_logits, + past_key_values=outputs.past_key_values, + decoder_hidden_states=outputs.decoder_hidden_states, + decoder_attentions=outputs.decoder_attentions, + encoder_last_hidden_state=outputs.encoder_last_hidden_state, + encoder_hidden_states=outputs.encoder_hidden_states, + encoder_attentions=outputs.encoder_attentions, + ) + + def prepare_inputs_for_generation(self, decoder_input_ids, past, attention_mask, use_cache, **kwargs): + assert past is not None, "past has to be defined for encoder_outputs" + + encoder_outputs, past_key_values = past + return { + "input_ids": None, # encoder_outputs is defined. input_ids not needed + "encoder_outputs": encoder_outputs, + "past_key_values": past_key_values, + "decoder_input_ids": decoder_input_ids, + "attention_mask": attention_mask, + # change this to avoid caching (presumably for debugging) + "use_cache": use_cache, + } + + def adjust_logits_during_generation(self, logits, cur_len, max_length): + if cur_len == 1: + self._force_token_ids_generation(logits, self.config.bos_token_id) + if cur_len == max_length - 1 and self.config.eos_token_id is not None: + self._force_token_ids_generation(logits, self.config.eos_token_id) + return logits + + def _force_token_ids_generation(self, scores, token_ids) -> None: + """force one of token_ids to be generated by setting prob of all other tokens to 0""" + if isinstance(token_ids, int): + token_ids = [token_ids] + all_but_token_ids_mask = torch.tensor( + [x for x in range(self.config.vocab_size) if x not in token_ids], + dtype=torch.long, + device=next(self.parameters()).device, + ) + assert len( + scores.shape) == 2, "scores should be of rank 2 with shape: [batch_size, vocab_size]" + scores[:, all_but_token_ids_mask] = -float("inf") + + @staticmethod + def _reorder_cache(past, beam_idx): + ((enc_out, enc_mask), past_key_values) = past + reordered_past = [] + for layer_past in past_key_values: + # get the correct batch idx from decoder layer's batch dim for cross and self-attn + layer_past_new = { + attn_key: _reorder_buffer(attn_cache, beam_idx) for attn_key, attn_cache in layer_past.items() + } + reordered_past.append(layer_past_new) + + new_enc_out = enc_out if enc_out is None else enc_out.index_select( + 0, beam_idx) + new_enc_mask = enc_mask if enc_mask is None else enc_mask.index_select( + 0, beam_idx) + + past = ((new_enc_out, new_enc_mask), reordered_past) + return past + + def get_encoder(self): + return self.model.encoder + + def get_output_embeddings(self): + return _make_linear_from_emb(self.model.shared) # make it on the fly + + def get_encoder_logits(self, input_ids, attention_mask=None): + # print(input_ids, attention_mask) + # encoder_outputs = self.model.get_encoder_outputs( + # self, + # input_ids, + # attention_mask=attention_mask, + # output_attentions=None, + # output_hidden_states=None, + # return_dict=None, + # ) + + encoder_outputs = self.model.encoder( + input_ids=input_ids, + attention_mask=attention_mask, + return_dict=True + ) + # logits and loss for the encoder + # last hidden state + 
encoder_last_hidden_state = encoder_outputs['last_hidden_state'] + encoder_logits = self.classification_head(encoder_last_hidden_state) + + # classification + if self.encoder_loss_type == 0: + # probs = torch.softmax(encoder_logits,dim=-1) + pass + # regression + else: + encoder_logits = encoder_logits.view(encoder_logits.size(0), -1) + encoder_logits = torch.sigmoid( + encoder_logits) * self.num_labels - 0.5 + return encoder_outputs, encoder_logits + + +class CBartLightning(LightningModule): + @staticmethod + def add_module_specific_args(parent_args): + parser = parent_args.add_argument_group("CBart specific parameters") + parser.add_argument('--num_labels', type=int, default=3) + parser.add_argument('--encoder_loss_type', type=int, default=0) + parser.add_argument('--loss_weight', type=float, default=1.0) + parser.add_argument('--label_weights', type=float, nargs='+', default=[1.0, 1.0, 1.0]) + parser.add_argument('--masked_lm', type=float, default=0) + return parent_args + + def __init__( + self, + args, + **kwargs, + ): + super().__init__() + self.save_hyperparameters(args) + self.model = BartForTextInfill.from_pretrained(args.model_path, num_labels=self.hparams.num_labels, + encoder_loss_type=self.hparams.encoder_loss_type, + loss_weight=self.hparams.loss_weight,) + self.model.label_weights = torch.tensor( + self.hparams.label_weights, dtype=torch.half) + + def forward(self, **inputs): + return self.model(**inputs) + + def training_step(self, batch, batch_idx): + outputs = self(**batch) + return outputs + + def validation_step(self, batch, batch_idx, dataloader_idx=0): + outputs = self(**batch) + val_loss = outputs["loss"] + + return {"loss": val_loss} + + def setup(self, stage=None) -> None: + if stage != "fit": + return + # Get dataloader by calling it - train_dataloader() is called after setup() by default + train_loader = self.trainer._data_connector._train_dataloader_source.dataloader() + + # Calculate total steps + tb_size = self.hparams.train_batchsize * max(1, self.trainer.gpus) + ab_size = self.trainer.accumulate_grad_batches * float(self.trainer.max_epochs) + self.total_steps = (len(train_loader.dataset) // tb_size) // ab_size + + def configure_optimizers(self): + transformer_utils.configure_optimizers(self) diff --git a/fengshen/models/clip/__init__.py b/fengshen/models/clip/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..8fcc95802f0a32cf3417a68b64c6e37a83813787 --- /dev/null +++ b/fengshen/models/clip/__init__.py @@ -0,0 +1,4 @@ +from .modeling_taiyi_clip import TaiyiCLIPModel +from .processing_taiyi_clip import TaiyiCLIPProcessor + +__all__ = ['TaiyiCLIPModel', 'TaiyiCLIPProcessor'] diff --git a/fengshen/models/clip/configuration_taiyi_clip.py b/fengshen/models/clip/configuration_taiyi_clip.py new file mode 100644 index 0000000000000000000000000000000000000000..46e1645bce1cf72d007dd21868a8fffe44fc41d7 --- /dev/null +++ b/fengshen/models/clip/configuration_taiyi_clip.py @@ -0,0 +1,183 @@ +# coding=utf-8 +# Copyright 2021 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +""" CLIP model configuration""" + +# from transformers import MegatronBertConfig as BertConfig +from transformers.models.bert.configuration_bert import BertConfig +from transformers.models.clip.configuration_clip import CLIPVisionConfig +import copy +from collections import OrderedDict +from typing import TYPE_CHECKING, Any, Mapping, Optional + + +if TYPE_CHECKING: + from transformers.processing_utils import ProcessorMixin + from transformers.utils import TensorType + +from transformers.configuration_utils import PretrainedConfig +from transformers.onnx import OnnxConfig +from transformers.utils import logging + + +logger = logging.get_logger(__name__) + + +class TaiyiCLIPConfig(PretrainedConfig): + r""" + [`CLIPConfig`] is the configuration class to store the configuration of a [`CLIPModel`]. It is used to instantiate + CLIP model according to the specified arguments, defining the text model and vision model configs. Instantiating a + configuration with the defaults will yield a similar configuration to that of the CLIP + [openai/clip-vit-base-patch32](https://huggingface.co./openai/clip-vit-base-patch32) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + text_config (`dict`, *optional*): + Dictionary of configuration options used to initialize [`CLIPTextConfig`]. + vision_config (`dict`, *optional*): + Dictionary of configuration options used to initialize [`CLIPVisionConfig`]. + projection_dim (`int`, *optional*, defaults to 512): + Dimentionality of text and vision projection layers. + logit_scale_init_value (`float`, *optional*, defaults to 2.6592): + The inital value of the *logit_scale* paramter. Default is used as per the original CLIP implementation. + kwargs (*optional*): + Dictionary of keyword arguments. + + Example: + + ```python + >>> from transformers import CLIPConfig, CLIPModel + + >>> # Initializing a CLIPConfig with openai/clip-vit-base-patch32 style configuration + >>> configuration = CLIPConfig() + + >>> # Initializing a CLIPModel (with random weights) from the openai/clip-vit-base-patch32 style configuration + >>> model = CLIPModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + + >>> # We can also initialize a CLIPConfig from a CLIPTextConfig and a CLIPVisionConfig + + >>> # Initializing a CLIPText and CLIPVision configuration + >>> config_text = CLIPTextConfig() + >>> config_vision = CLIPVisionConfig() + + >>> config = CLIPConfig.from_text_vision_configs(config_text, config_vision) + ```""" + + model_type = "clip" + is_composition = True + + def __init__( + self, text_config=None, vision_config=None, projection_dim=512, logit_scale_init_value=2.6592, **kwargs + ): + super().__init__(**kwargs) + + # If `_config_dict` exist, we use them for the backward compatibility. + text_config_dict = kwargs.pop("text_config_dict", None) + vision_config_dict = kwargs.pop("vision_config_dict", None) + if text_config_dict is not None: + text_config = text_config_dict + if vision_config_dict is not None: + vision_config = vision_config_dict + + if text_config is None: + text_config = {} + logger.info("text_config is None. Initializing the CLIPTextConfig with default values.") + + if vision_config is None: + vision_config = {} + logger.info("vision_config is None. 
initializing the CLIPVisionConfig with default values.") + + self.text_config = BertConfig(**text_config) + self.vision_config = CLIPVisionConfig(**vision_config) + + self.projection_dim = projection_dim + self.logit_scale_init_value = logit_scale_init_value + self.initializer_factor = 1.0 + + @classmethod + def from_text_vision_configs(cls, text_config: BertConfig, vision_config: CLIPVisionConfig, **kwargs): + r""" + Instantiate a [`CLIPConfig`] (or a derived class) from clip text model configuration and clip vision model + configuration. + + Returns: + [`CLIPConfig`]: An instance of a configuration object + """ + + return cls(text_config=text_config.to_dict(), vision_config=vision_config.to_dict(), **kwargs) + + def to_dict(self): + """ + Serializes this instance to a Python dictionary. Override the default [`~PretrainedConfig.to_dict`]. + + Returns: + `Dict[str, any]`: Dictionary of all the attributes that make up this configuration instance, + """ + output = copy.deepcopy(self.__dict__) + output["text_config"] = self.text_config.to_dict() + output["vision_config"] = self.vision_config.to_dict() + output["model_type"] = self.__class__.model_type + return output + + +class CLIPOnnxConfig(OnnxConfig): + @property + def inputs(self) -> Mapping[str, Mapping[int, str]]: + return OrderedDict( + [ + ("input_ids", {0: "batch", 1: "sequence"}), + ("pixel_values", {0: "batch", 1: "num_channels", 2: "height", 3: "width"}), + ("attention_mask", {0: "batch", 1: "sequence"}), + ] + ) + + @property + def outputs(self) -> Mapping[str, Mapping[int, str]]: + return OrderedDict( + [ + ("logits_per_image", {0: "batch"}), + ("logits_per_text", {0: "batch"}), + ("text_embeds", {0: "batch"}), + ("image_embeds", {0: "batch"}), + ] + ) + + @property + def atol_for_validation(self) -> float: + return 1e-4 + + def generate_dummy_inputs( + self, + processor: "ProcessorMixin", + batch_size: int = -1, + seq_length: int = -1, + framework: Optional["TensorType"] = None, + ) -> Mapping[str, Any]: + + text_input_dict = super().generate_dummy_inputs( + processor.tokenizer, batch_size=batch_size, seq_length=seq_length, framework=framework + ) + image_input_dict = super().generate_dummy_inputs( + processor.feature_extractor, batch_size=batch_size, framework=framework + ) + return {**text_input_dict, **image_input_dict} + + @property + def default_onnx_opset(self) -> int: + return 14 diff --git a/fengshen/models/clip/modeling_taiyi_clip.py b/fengshen/models/clip/modeling_taiyi_clip.py new file mode 100644 index 0000000000000000000000000000000000000000..e759f41caeb9e1dbc7395a372280e1a4b9cdee1d --- /dev/null +++ b/fengshen/models/clip/modeling_taiyi_clip.py @@ -0,0 +1,253 @@ +import torch +from torch import nn +from transformers.models.clip.modeling_clip import ( + add_start_docstrings, + add_start_docstrings_to_model_forward, + CLIP_START_DOCSTRING, + CLIP_TEXT_INPUTS_DOCSTRING, + CLIP_VISION_INPUTS_DOCSTRING, + CLIP_INPUTS_DOCSTRING, + replace_return_docstrings, + CLIPVisionConfig, + CLIPPreTrainedModel, + CLIPVisionTransformer, + CLIPOutput, + CLIPConfig, + clip_loss, +) +from typing import Optional, Tuple, Union +# from transformers import MegatronBertConfig as BertConfig +# from transformers import MegatronBertModel as BertModel +from transformers.models.bert.modeling_bert import BertModel +from transformers.models.bert.configuration_bert import BertConfig +from .configuration_taiyi_clip import TaiyiCLIPConfig + + +@add_start_docstrings(CLIP_START_DOCSTRING) +class TaiyiCLIPModel(CLIPPreTrainedModel): + config_class = 
TaiyiCLIPConfig
+
+    def __init__(self, config: TaiyiCLIPConfig):
+        super().__init__(config)
+
+        if not isinstance(config.text_config, BertConfig):
+            raise ValueError(
+                "config.text_config is expected to be of type BertConfig but is of type"
+                f" {type(config.text_config)}."
+            )
+
+        if not isinstance(config.vision_config, CLIPVisionConfig):
+            raise ValueError(
+                "config.vision_config is expected to be of type CLIPVisionConfig but is of type"
+                f" {type(config.vision_config)}."
+            )
+
+        text_config = config.text_config
+        vision_config = config.vision_config
+
+        self.projection_dim = config.projection_dim
+        self.text_embed_dim = text_config.hidden_size
+        self.vision_embed_dim = vision_config.hidden_size
+
+        self.text_model = BertModel(text_config)
+        self.vision_model = CLIPVisionTransformer(vision_config)
+
+        self.visual_projection = nn.Linear(self.vision_embed_dim, self.projection_dim, bias=False)
+        self.text_projection = nn.Linear(self.text_embed_dim, self.projection_dim, bias=False)
+        self.logit_scale = nn.Parameter(torch.ones([]) * self.config.logit_scale_init_value)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @add_start_docstrings_to_model_forward(CLIP_TEXT_INPUTS_DOCSTRING)
+    def get_text_features(
+        self,
+        input_ids: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.Tensor] = None,
+        token_type_ids: Optional[torch.Tensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> torch.FloatTensor:
+        r"""
+        Returns:
+            text_features (`torch.FloatTensor` of shape `(batch_size, output_dim)`): The text embeddings obtained by
+            applying the projection layer to the pooled output of [`CLIPTextModel`].
+
+        Examples:
+
+        ```python
+        >>> from transformers import CLIPTokenizer, CLIPModel
+
+        >>> model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
+        >>> tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-base-patch32")
+
+        >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt")
+        >>> text_features = model.get_text_features(**inputs)
+        ```"""
+        # Use CLIP model's config for some fields (if specified) instead of those of vision & text components.
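+        # The text tower here is a BERT encoder rather than CLIP's text transformer, so the sentence embedding
+        # below is taken from the [CLS] token's hidden state instead of CLIP's end-of-text pooling.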
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + text_outputs = self.text_model( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + token_type_ids=token_type_ids, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + # pooled_output = text_outputs[1] + pooled_output = text_outputs[0][:, 0, :] + text_features = self.text_projection(pooled_output) + + return text_features + + @add_start_docstrings_to_model_forward(CLIP_VISION_INPUTS_DOCSTRING) + def get_image_features( + self, + pixel_values: Optional[torch.FloatTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> torch.FloatTensor: + r""" + Returns: + image_features (`torch.FloatTensor` of shape `(batch_size, output_dim`): The image embeddings obtained by + applying the projection layer to the pooled output of [`CLIPVisionModel`]. + + Examples: + + ```python + >>> from PIL import Image + >>> import requests + >>> from transformers import CLIPProcessor, CLIPModel + + >>> model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32") + >>> processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32") + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> inputs = processor(images=image, return_tensors="pt") + + >>> image_features = model.get_image_features(**inputs) + ```""" + # Use CLIP model's config for some fields (if specified) instead of those of vision & text components. 
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + vision_outputs = self.vision_model( + pixel_values=pixel_values, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + pooled_output = vision_outputs[1] # pooled_output + image_features = self.visual_projection(pooled_output) + + return image_features + + @add_start_docstrings_to_model_forward(CLIP_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=CLIPOutput, config_class=CLIPConfig) + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + pixel_values: Optional[torch.FloatTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + return_loss: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, CLIPOutput]: + r""" + Returns: + + Examples: + + ```python + >>> from PIL import Image + >>> import requests + >>> from transformers import CLIPProcessor, CLIPModel + + >>> model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32") + >>> processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32") + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> inputs = processor( + ... text=["a photo of a cat", "a photo of a dog"], images=image, return_tensors="pt", padding=True + ... ) + + >>> outputs = model(**inputs) + >>> logits_per_image = outputs.logits_per_image # this is the image-text similarity score + >>> probs = logits_per_image.softmax(dim=1) # we can take the softmax to get the label probabilities + ```""" + # Use CLIP model's config for some fields (if specified) instead of those of vision & text components. 
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + vision_outputs = self.vision_model( + pixel_values=pixel_values, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + text_outputs = self.text_model( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + image_embeds = vision_outputs[1] + image_embeds = self.visual_projection(image_embeds) + + text_embeds = text_outputs[1] + text_embeds = self.text_projection(text_embeds) + + # normalized features + image_embeds = image_embeds / image_embeds.norm(p=2, dim=-1, keepdim=True) + text_embeds = text_embeds / text_embeds.norm(p=2, dim=-1, keepdim=True) + + # cosine similarity as logits + logit_scale = self.logit_scale.exp() + logits_per_text = torch.matmul(text_embeds, image_embeds.t()) * logit_scale + logits_per_image = logits_per_text.t() + + loss = None + if return_loss: + loss = clip_loss(logits_per_text) + + if not return_dict: + output = (logits_per_image, logits_per_text, text_embeds, + image_embeds, text_outputs, vision_outputs) + return ((loss,) + output) if loss is not None else output + + return CLIPOutput( + loss=loss, + logits_per_image=logits_per_image, + logits_per_text=logits_per_text, + text_embeds=text_embeds, + image_embeds=image_embeds, + text_model_output=text_outputs, + vision_model_output=vision_outputs, + ) diff --git a/fengshen/models/clip/processing_taiyi_clip.py b/fengshen/models/clip/processing_taiyi_clip.py new file mode 100644 index 0000000000000000000000000000000000000000..25350551ee0d5c543cad8a7d759542459cf32cf9 --- /dev/null +++ b/fengshen/models/clip/processing_taiyi_clip.py @@ -0,0 +1,115 @@ +# coding=utf-8 +# Copyright 2022 The OFA-Sys Team Authors and The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Image/Text processor class for Taiyi-CLIP +""" +from transformers.processing_utils import ProcessorMixin +from transformers.tokenization_utils_base import BatchEncoding + + +class TaiyiCLIPProcessor(ProcessorMixin): + r""" + Constructs a Taiyi-CLIP processor which wraps a Taiyi-CLIP feature extractor and a Taiyi-CLIP tokenizer into + a single processor. + + [`TaiyiCLIPProcessor`] offers all the functionalities of [`CLIPFeatureExtractor`] and + [`BertTokenizerFast`]. See the [`~TaiyiCLIPProcessor.__call__`] and [`~TaiyiCLIPProcessor.decode`] for more + information. + + Args: + feature_extractor ([`CLIPFeatureExtractor`]): + The feature extractor is a required input. + tokenizer ([`BertTokenizerFast`]): + The tokenizer is a required input. 
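+
+    Example (a minimal sketch; the checkpoint names below are generic placeholders rather than the released
+    Taiyi weights):
+
+    ```python
+    >>> from PIL import Image
+    >>> import requests
+    >>> from transformers import BertTokenizerFast, CLIPFeatureExtractor
+
+    >>> tokenizer = BertTokenizerFast.from_pretrained("bert-base-chinese")
+    >>> feature_extractor = CLIPFeatureExtractor.from_pretrained("openai/clip-vit-base-patch32")
+    >>> processor = TaiyiCLIPProcessor(feature_extractor, tokenizer)
+
+    >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+    >>> image = Image.open(requests.get(url, stream=True).raw)
+    >>> inputs = processor(text=["一张猫的照片"], images=image, return_tensors="pt", padding=True)
+    ```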
+ """ + feature_extractor_class = "CLIPFeatureExtractor" + tokenizer_class = ("BertTokenizer", "BertTokenizerFast") + + def __init__(self, feature_extractor, tokenizer): + super().__init__(feature_extractor, tokenizer) + self.current_processor = self.feature_extractor + + def __call__(self, text=None, images=None, return_tensors=None, **kwargs): + """ + Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text` + and `kwargs` arguments to BertTokenizerFast's [`~BertTokenizerFast.__call__`] if `text` is not `None` to encode + the text. To prepare the image(s), this method forwards the `images` and `kwrags` arguments to + CLIPFeatureExtractor's [`~CLIPFeatureExtractor.__call__`] if `images` is not `None`. Please refer to the + doctsring of the above two methods for more information. + + Args: + text (`str`, `List[str]`, `List[List[str]]`): + The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings + (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set + `is_split_into_words=True` (to lift the ambiguity with a batch of sequences). + images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`): + The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch + tensor. In case of a NumPy array/PyTorch tensor, each image should be of shape (C, H, W), where C is a + number of channels, H and W are image height and width. + + return_tensors (`str` or [`~utils.TensorType`], *optional*): + If set, will return tensors of a particular framework. Acceptable values are: + + - `'tf'`: Return TensorFlow `tf.constant` objects. + - `'pt'`: Return PyTorch `torch.Tensor` objects. + - `'np'`: Return NumPy `np.ndarray` objects. + - `'jax'`: Return JAX `jnp.ndarray` objects. + + Returns: + [`BatchEncoding`]: A [`BatchEncoding`] with the following fields: + + - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`. + - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when + `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not + `None`). + - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`. + """ + + if text is None and images is None: + raise ValueError("You have to specify either text or images. Both cannot be none.") + + if text is not None: + encoding = self.tokenizer(text, return_tensors=return_tensors, **kwargs) + + if images is not None: + image_features = self.feature_extractor(images, return_tensors=return_tensors, **kwargs) + + if text is not None and images is not None: + encoding["pixel_values"] = image_features.pixel_values + return encoding + elif text is not None: + return encoding + else: + return BatchEncoding(data=dict(**image_features), tensor_type=return_tensors) + + def batch_decode(self, *args, **kwargs): + """ + This method forwards all its arguments to BertTokenizerFast's [`~PreTrainedTokenizer.batch_decode`]. Please + refer to the docstring of this method for more information. + """ + return self.tokenizer.batch_decode(*args, **kwargs) + + def decode(self, *args, **kwargs): + """ + This method forwards all its arguments to BertTokenizerFast's [`~PreTrainedTokenizer.decode`]. Please refer to + the docstring of this method for more information. 
+ """ + return self.tokenizer.decode(*args, **kwargs) + + @property + def model_input_names(self): + tokenizer_input_names = self.tokenizer.model_input_names + feature_extractor_input_names = self.feature_extractor.model_input_names + return list(dict.fromkeys(tokenizer_input_names + feature_extractor_input_names)) diff --git a/fengshen/models/deberta_v2/modeling_deberta_v2.py b/fengshen/models/deberta_v2/modeling_deberta_v2.py new file mode 100644 index 0000000000000000000000000000000000000000..d7437a1160cabb7f1446ee3c62bc6fa5a02a59ba --- /dev/null +++ b/fengshen/models/deberta_v2/modeling_deberta_v2.py @@ -0,0 +1,1617 @@ +# coding=utf-8 +# Copyright 2020 Microsoft and the Hugging Face Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" PyTorch DeBERTa-v2 model.""" + +import math +from collections.abc import Sequence +from typing import Optional, Tuple, Union + +import numpy as np +import torch +from torch import nn +from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, LayerNorm, MSELoss + +from transformers.activations import ACT2FN +from transformers.modeling_outputs import ( + BaseModelOutput, + MaskedLMOutput, + MultipleChoiceModelOutput, + QuestionAnsweringModelOutput, + SequenceClassifierOutput, + TokenClassifierOutput, +) +from transformers.modeling_utils import PreTrainedModel +from transformers.pytorch_utils import softmax_backward_data +from transformers.utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward, logging +from transformers import DebertaV2Config + + +logger = logging.get_logger(__name__) + +_CONFIG_FOR_DOC = "DebertaV2Config" +_TOKENIZER_FOR_DOC = "DebertaV2Tokenizer" +_CHECKPOINT_FOR_DOC = "microsoft/deberta-v2-xlarge" + +DEBERTA_V2_PRETRAINED_MODEL_ARCHIVE_LIST = [ + "microsoft/deberta-v2-xlarge", + "microsoft/deberta-v2-xxlarge", + "microsoft/deberta-v2-xlarge-mnli", + "microsoft/deberta-v2-xxlarge-mnli", +] + + +# Copied from transformers.models.deberta.modeling_deberta.ContextPooler +class ContextPooler(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.pooler_hidden_size, config.pooler_hidden_size) + self.dropout = StableDropout(config.pooler_dropout) + self.config = config + + def forward(self, hidden_states): + # We "pool" the model by simply taking the hidden state corresponding + # to the first token. + + context_token = hidden_states[:, 0] + context_token = self.dropout(context_token) + pooled_output = self.dense(context_token) + pooled_output = ACT2FN[self.config.pooler_hidden_act](pooled_output) + return pooled_output + + @property + def output_dim(self): + return self.config.hidden_size + + +# Copied from transformers.models.deberta.modeling_deberta.XSoftmax with deberta->deberta_v2 +class XSoftmax(torch.autograd.Function): + """ + Masked Softmax which is optimized for saving memory + + Args: + input (`torch.tensor`): The input tensor that will apply softmax. 
+ mask (`torch.IntTensor`): + The mask matrix where 0 indicate that element will be ignored in the softmax calculation. + dim (int): The dimension that will apply softmax + + Example: + + ```python + >>> import torch + >>> from transformers.models.deberta_v2.modeling_deberta_v2 import XSoftmax + + >>> # Make a tensor + >>> x = torch.randn([4, 20, 100]) + + >>> # Create a mask + >>> mask = (x > 0).int() + + >>> # Specify the dimension to apply softmax + >>> dim = -1 + + >>> y = XSoftmax.apply(x, mask, dim) + ```""" + + @staticmethod + def forward(self, input, mask, dim): + self.dim = dim + rmask = ~(mask.to(torch.bool)) + + output = input.masked_fill(rmask, torch.tensor(torch.finfo(input.dtype).min)) + output = torch.softmax(output, self.dim) + output.masked_fill_(rmask, 0) + self.save_for_backward(output) + return output + + @staticmethod + def backward(self, grad_output): + (output,) = self.saved_tensors + inputGrad = softmax_backward_data(self, grad_output, output, self.dim, output) + return inputGrad, None, None + + @staticmethod + def symbolic(g, self, mask, dim): + import torch.onnx.symbolic_helper as sym_help + from torch.onnx.symbolic_opset9 import masked_fill, softmax + + mask_cast_value = g.op("Cast", mask, to_i=sym_help.cast_pytorch_to_onnx["Long"]) + r_mask = g.op( + "Cast", + g.op("Sub", g.op("Constant", value_t=torch.tensor(1, dtype=torch.int64)), mask_cast_value), + to_i=sym_help.cast_pytorch_to_onnx["Byte"], + ) + output = masked_fill(g, self, r_mask, g.op("Constant", value_t=torch.tensor(torch.finfo(self.dtype).min))) + output = softmax(g, output, dim) + return masked_fill(g, output, r_mask, g.op("Constant", value_t=torch.tensor(0, dtype=torch.uint8))) + + +# Copied from transformers.models.deberta.modeling_deberta.DropoutContext +class DropoutContext(object): + def __init__(self): + self.dropout = 0 + self.mask = None + self.scale = 1 + self.reuse_mask = True + + +# Copied from transformers.models.deberta.modeling_deberta.get_mask +def get_mask(input, local_context): + if not isinstance(local_context, DropoutContext): + dropout = local_context + mask = None + else: + dropout = local_context.dropout + dropout *= local_context.scale + mask = local_context.mask if local_context.reuse_mask else None + + if dropout > 0 and mask is None: + mask = (1 - torch.empty_like(input).bernoulli_(1 - dropout)).to(torch.bool) + + if isinstance(local_context, DropoutContext): + if local_context.mask is None: + local_context.mask = mask + + return mask, dropout + + +# Copied from transformers.models.deberta.modeling_deberta.XDropout +class XDropout(torch.autograd.Function): + """Optimized dropout function to save computation and memory by using mask operation instead of multiplication.""" + + @staticmethod + def forward(ctx, input, local_ctx): + mask, dropout = get_mask(input, local_ctx) + ctx.scale = 1.0 / (1 - dropout) + if dropout > 0: + ctx.save_for_backward(mask) + return input.masked_fill(mask, 0) * ctx.scale + else: + return input + + @staticmethod + def backward(ctx, grad_output): + if ctx.scale > 1: + (mask,) = ctx.saved_tensors + return grad_output.masked_fill(mask, 0) * ctx.scale, None + else: + return grad_output, None + + +# Copied from transformers.models.deberta.modeling_deberta.StableDropout +class StableDropout(nn.Module): + """ + Optimized dropout module for stabilizing the training + + Args: + drop_prob (float): the dropout probabilities + """ + + def __init__(self, drop_prob): + super().__init__() + self.drop_prob = drop_prob + self.count = 0 + self.context_stack = None + 
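+    # The dropout mask is applied through XDropout (masked_fill followed by rescaling with 1/(1 - p));
+    # init_context()/get_context() let the same mask be reused across repeated calls when reuse_mask is set.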
+ def forward(self, x): + """ + Call the module + + Args: + x (`torch.tensor`): The input tensor to apply dropout + """ + if self.training and self.drop_prob > 0: + return XDropout.apply(x, self.get_context()) + return x + + def clear_context(self): + self.count = 0 + self.context_stack = None + + def init_context(self, reuse_mask=True, scale=1): + if self.context_stack is None: + self.context_stack = [] + self.count = 0 + for c in self.context_stack: + c.reuse_mask = reuse_mask + c.scale = scale + + def get_context(self): + if self.context_stack is not None: + if self.count >= len(self.context_stack): + self.context_stack.append(DropoutContext()) + ctx = self.context_stack[self.count] + ctx.dropout = self.drop_prob + self.count += 1 + return ctx + else: + return self.drop_prob + + +# Copied from transformers.models.deberta.modeling_deberta.DebertaSelfOutput with DebertaLayerNorm->LayerNorm +class DebertaV2SelfOutput(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.LayerNorm = LayerNorm(config.hidden_size, config.layer_norm_eps) + self.dropout = StableDropout(config.hidden_dropout_prob) + + def forward(self, hidden_states, input_tensor): + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +# Copied from transformers.models.deberta.modeling_deberta.DebertaAttention with Deberta->DebertaV2 +class DebertaV2Attention(nn.Module): + def __init__(self, config): + super().__init__() + self.self = DisentangledSelfAttention(config) + self.output = DebertaV2SelfOutput(config) + self.config = config + + def forward( + self, + hidden_states, + attention_mask, + output_attentions=False, + query_states=None, + relative_pos=None, + rel_embeddings=None, + ): + self_output = self.self( + hidden_states, + attention_mask, + output_attentions, + query_states=query_states, + relative_pos=relative_pos, + rel_embeddings=rel_embeddings, + ) + if output_attentions: + self_output, att_matrix = self_output + if query_states is None: + query_states = hidden_states + attention_output = self.output(self_output, query_states) + + if output_attentions: + return (attention_output, att_matrix) + else: + return attention_output + + +# Copied from transformers.models.bert.modeling_bert.BertIntermediate with Bert->DebertaV2 +class DebertaV2Intermediate(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.intermediate_size) + if isinstance(config.hidden_act, str): + self.intermediate_act_fn = ACT2FN[config.hidden_act] + else: + self.intermediate_act_fn = config.hidden_act + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.intermediate_act_fn(hidden_states) + return hidden_states + + +# Copied from transformers.models.deberta.modeling_deberta.DebertaOutput with DebertaLayerNorm->LayerNorm +class DebertaV2Output(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.intermediate_size, config.hidden_size) + self.LayerNorm = LayerNorm(config.hidden_size, config.layer_norm_eps) + self.dropout = StableDropout(config.hidden_dropout_prob) + self.config = config + + def forward(self, hidden_states, input_tensor): + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = 
self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +# Copied from transformers.models.deberta.modeling_deberta.DebertaLayer with Deberta->DebertaV2 +class DebertaV2Layer(nn.Module): + def __init__(self, config): + super().__init__() + self.attention = DebertaV2Attention(config) + self.intermediate = DebertaV2Intermediate(config) + self.output = DebertaV2Output(config) + + def forward( + self, + hidden_states, + attention_mask, + query_states=None, + relative_pos=None, + rel_embeddings=None, + output_attentions=False, + ): + attention_output = self.attention( + hidden_states, + attention_mask, + output_attentions=output_attentions, + query_states=query_states, + relative_pos=relative_pos, + rel_embeddings=rel_embeddings, + ) + if output_attentions: + attention_output, att_matrix = attention_output + intermediate_output = self.intermediate(attention_output) + layer_output = self.output(intermediate_output, attention_output) + if output_attentions: + return (layer_output, att_matrix) + else: + return layer_output + + +class ConvLayer(nn.Module): + def __init__(self, config): + super().__init__() + kernel_size = getattr(config, "conv_kernel_size", 3) + groups = getattr(config, "conv_groups", 1) + self.conv_act = getattr(config, "conv_act", "tanh") + self.conv = nn.Conv1d( + config.hidden_size, config.hidden_size, kernel_size, padding=(kernel_size - 1) // 2, groups=groups + ) + self.LayerNorm = LayerNorm(config.hidden_size, config.layer_norm_eps) + self.dropout = StableDropout(config.hidden_dropout_prob) + self.config = config + + def forward(self, hidden_states, residual_states, input_mask): + out = self.conv(hidden_states.permute(0, 2, 1).contiguous()).permute(0, 2, 1).contiguous() + rmask = (1 - input_mask).bool() + out.masked_fill_(rmask.unsqueeze(-1).expand(out.size()), 0) + out = ACT2FN[self.conv_act](self.dropout(out)) + + layer_norm_input = residual_states + out + output = self.LayerNorm(layer_norm_input).to(layer_norm_input) + + if input_mask is None: + output_states = output + else: + if input_mask.dim() != layer_norm_input.dim(): + if input_mask.dim() == 4: + input_mask = input_mask.squeeze(1).squeeze(1) + input_mask = input_mask.unsqueeze(2) + + input_mask = input_mask.to(output.dtype) + output_states = output * input_mask + + return output_states + + +class DebertaV2Encoder(nn.Module): + """Modified BertEncoder with relative position bias support""" + + def __init__(self, config): + super().__init__() + + self.layer = nn.ModuleList([DebertaV2Layer(config) for _ in range(config.num_hidden_layers)]) + self.relative_attention = getattr(config, "relative_attention", False) + + if self.relative_attention: + self.max_relative_positions = getattr(config, "max_relative_positions", -1) + if self.max_relative_positions < 1: + self.max_relative_positions = config.max_position_embeddings + + self.position_buckets = getattr(config, "position_buckets", -1) + pos_ebd_size = self.max_relative_positions * 2 + + if self.position_buckets > 0: + pos_ebd_size = self.position_buckets * 2 + + self.rel_embeddings = nn.Embedding(pos_ebd_size, config.hidden_size) + + self.norm_rel_ebd = [x.strip() for x in getattr(config, "norm_rel_ebd", "none").lower().split("|")] + + if "layer_norm" in self.norm_rel_ebd: + self.LayerNorm = LayerNorm(config.hidden_size, config.layer_norm_eps, elementwise_affine=True) + + self.conv = ConvLayer(config) if getattr(config, "conv_kernel_size", 0) > 0 else None + self.gradient_checkpointing = False + + def get_rel_embedding(self): + rel_embeddings = 
self.rel_embeddings.weight if self.relative_attention else None + if rel_embeddings is not None and ("layer_norm" in self.norm_rel_ebd): + rel_embeddings = self.LayerNorm(rel_embeddings) + return rel_embeddings + + def get_attention_mask(self, attention_mask): + if attention_mask.dim() <= 2: + extended_attention_mask = attention_mask.unsqueeze(1).unsqueeze(2) + attention_mask = extended_attention_mask * extended_attention_mask.squeeze(-2).unsqueeze(-1) + attention_mask = attention_mask.byte() + elif attention_mask.dim() == 3: + attention_mask = attention_mask.unsqueeze(1) + + return attention_mask + + def get_rel_pos(self, hidden_states, query_states=None, relative_pos=None): + if self.relative_attention and relative_pos is None: + q = query_states.size(-2) if query_states is not None else hidden_states.size(-2) + relative_pos = build_relative_position( + q, hidden_states.size(-2), bucket_size=self.position_buckets, max_position=self.max_relative_positions + ) + return relative_pos + + def forward( + self, + hidden_states, + attention_mask, + output_hidden_states=True, + output_attentions=False, + query_states=None, + relative_pos=None, + return_dict=True, + ): + if attention_mask.dim() <= 2: + input_mask = attention_mask + else: + input_mask = (attention_mask.sum(-2) > 0).byte() + attention_mask = self.get_attention_mask(attention_mask) + relative_pos = self.get_rel_pos(hidden_states, query_states, relative_pos) + + all_hidden_states = () if output_hidden_states else None + all_attentions = () if output_attentions else None + + if isinstance(hidden_states, Sequence): + next_kv = hidden_states[0] + else: + next_kv = hidden_states + rel_embeddings = self.get_rel_embedding() + output_states = next_kv + for i, layer_module in enumerate(self.layer): + + if output_hidden_states: + all_hidden_states = all_hidden_states + (output_states,) + + if self.gradient_checkpointing and self.training: + + def create_custom_forward(module): + def custom_forward(*inputs): + return module(*inputs, output_attentions) + + return custom_forward + + output_states = torch.utils.checkpoint.checkpoint( + create_custom_forward(layer_module), + next_kv, + attention_mask, + query_states, + relative_pos, + rel_embeddings, + ) + else: + output_states = layer_module( + next_kv, + attention_mask, + query_states=query_states, + relative_pos=relative_pos, + rel_embeddings=rel_embeddings, + output_attentions=output_attentions, + ) + + if output_attentions: + output_states, att_m = output_states + + if i == 0 and self.conv is not None: + output_states = self.conv(hidden_states, output_states, input_mask) + + if query_states is not None: + query_states = output_states + if isinstance(hidden_states, Sequence): + next_kv = hidden_states[i + 1] if i + 1 < len(self.layer) else None + else: + next_kv = output_states + + if output_attentions: + all_attentions = all_attentions + (att_m,) + + if output_hidden_states: + all_hidden_states = all_hidden_states + (output_states,) + + if not return_dict: + return tuple(v for v in [output_states, all_hidden_states, all_attentions] if v is not None) + return BaseModelOutput( + last_hidden_state=output_states, hidden_states=all_hidden_states, attentions=all_attentions + ) + + +def make_log_bucket_position(relative_pos, bucket_size, max_position): + sign = np.sign(relative_pos) + mid = bucket_size // 2 + abs_pos = np.where((relative_pos < mid) & (relative_pos > -mid), mid - 1, np.abs(relative_pos)) + log_pos = np.ceil(np.log(abs_pos / mid) / np.log((max_position - 1) / mid) * (mid - 1)) + mid + 
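+    # Offsets within half the bucket size keep their exact relative position; larger offsets are mapped to
+    # signed, log-spaced buckets.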
bucket_pos = np.where(abs_pos <= mid, relative_pos, log_pos * sign).astype(np.int64)
+    return bucket_pos
+
+
+def build_relative_position(query_size, key_size, bucket_size=-1, max_position=-1):
+    """
+    Build relative position according to the query and key
+
+    We assume the absolute position of query \\(P_q\\) ranges from (0, query_size) and the absolute position of key
+    \\(P_k\\) ranges from (0, key_size). The relative position from query to key is \\(R_{q \\rightarrow k} = P_q -
+    P_k\\)
+
+    Args:
+        query_size (int): the length of query
+        key_size (int): the length of key
+        bucket_size (int): the size of position bucket
+        max_position (int): the maximum allowed absolute position
+
+    Return:
+        `torch.LongTensor`: A tensor with shape [1, query_size, key_size]
+
+    """
+    q_ids = np.arange(0, query_size)
+    k_ids = np.arange(0, key_size)
+    rel_pos_ids = q_ids[:, None] - np.tile(k_ids, (q_ids.shape[0], 1))
+    if bucket_size > 0 and max_position > 0:
+        rel_pos_ids = make_log_bucket_position(rel_pos_ids, bucket_size, max_position)
+    rel_pos_ids = torch.tensor(rel_pos_ids, dtype=torch.long)
+    rel_pos_ids = rel_pos_ids[:query_size, :]
+    rel_pos_ids = rel_pos_ids.unsqueeze(0)
+    return rel_pos_ids
+
+
+@torch.jit.script
+# Copied from transformers.models.deberta.modeling_deberta.c2p_dynamic_expand
+def c2p_dynamic_expand(c2p_pos, query_layer, relative_pos):
+    return c2p_pos.expand([query_layer.size(0), query_layer.size(1), query_layer.size(2), relative_pos.size(-1)])
+
+
+@torch.jit.script
+# Copied from transformers.models.deberta.modeling_deberta.p2c_dynamic_expand
+def p2c_dynamic_expand(c2p_pos, query_layer, key_layer):
+    return c2p_pos.expand([query_layer.size(0), query_layer.size(1), key_layer.size(-2), key_layer.size(-2)])
+
+
+@torch.jit.script
+# Copied from transformers.models.deberta.modeling_deberta.pos_dynamic_expand
+def pos_dynamic_expand(pos_index, p2c_att, key_layer):
+    return pos_index.expand(p2c_att.size()[:2] + (pos_index.size(-2), key_layer.size(-2)))
+
+
+class DisentangledSelfAttention(nn.Module):
+    """
+    Disentangled self-attention module
+
+    Parameters:
+        config (`DebertaV2Config`):
+            A model config class instance with the configuration to build a new model.
The schema is similar to + *BertConfig*, for more details, please refer [`DebertaV2Config`] + + """ + + def __init__(self, config): + super().__init__() + if config.hidden_size % config.num_attention_heads != 0: + raise ValueError( + f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention " + f"heads ({config.num_attention_heads})" + ) + self.num_attention_heads = config.num_attention_heads + _attention_head_size = config.hidden_size // config.num_attention_heads + self.attention_head_size = getattr(config, "attention_head_size", _attention_head_size) + self.all_head_size = self.num_attention_heads * self.attention_head_size + self.query_proj = nn.Linear(config.hidden_size, self.all_head_size, bias=True) + self.key_proj = nn.Linear(config.hidden_size, self.all_head_size, bias=True) + self.value_proj = nn.Linear(config.hidden_size, self.all_head_size, bias=True) + + self.share_att_key = getattr(config, "share_att_key", False) + self.pos_att_type = config.pos_att_type if config.pos_att_type is not None else [] + self.relative_attention = getattr(config, "relative_attention", False) + + if self.relative_attention: + self.position_buckets = getattr(config, "position_buckets", -1) + self.max_relative_positions = getattr(config, "max_relative_positions", -1) + if self.max_relative_positions < 1: + self.max_relative_positions = config.max_position_embeddings + self.pos_ebd_size = self.max_relative_positions + if self.position_buckets > 0: + self.pos_ebd_size = self.position_buckets + + self.pos_dropout = StableDropout(config.hidden_dropout_prob) + + if not self.share_att_key: + if "c2p" in self.pos_att_type: + self.pos_key_proj = nn.Linear(config.hidden_size, self.all_head_size, bias=True) + if "p2c" in self.pos_att_type: + self.pos_query_proj = nn.Linear(config.hidden_size, self.all_head_size) + + self.dropout = StableDropout(config.attention_probs_dropout_prob) + + def transpose_for_scores(self, x, attention_heads): + new_x_shape = x.size()[:-1] + (attention_heads, -1) + x = x.view(new_x_shape) + return x.permute(0, 2, 1, 3).contiguous().view(-1, x.size(1), x.size(-1)) + + def forward( + self, + hidden_states, + attention_mask, + output_attentions=False, + query_states=None, + relative_pos=None, + rel_embeddings=None, + ): + """ + Call the module + + Args: + hidden_states (`torch.FloatTensor`): + Input states to the module usually the output from previous layer, it will be the Q,K and V in + *Attention(Q,K,V)* + + attention_mask (`torch.ByteTensor`): + An attention mask matrix of shape [*B*, *N*, *N*] where *B* is the batch size, *N* is the maximum + sequence length in which element [i,j] = *1* means the *i* th token in the input can attend to the *j* + th token. + + output_attentions (`bool`, optional): + Whether return the attention matrix. + + query_states (`torch.FloatTensor`, optional): + The *Q* state in *Attention(Q,K,V)*. + + relative_pos (`torch.LongTensor`): + The relative position encoding between the tokens in the sequence. It's of shape [*B*, *N*, *N*] with + values ranging in [*-max_relative_positions*, *max_relative_positions*]. + + rel_embeddings (`torch.FloatTensor`): + The embedding of relative distances. It's a tensor of shape [\\(2 \\times + \\text{max_relative_positions}\\), *hidden_size*]. 
+ + + """ + if query_states is None: + query_states = hidden_states + query_layer = self.transpose_for_scores(self.query_proj(query_states), self.num_attention_heads) + key_layer = self.transpose_for_scores(self.key_proj(hidden_states), self.num_attention_heads) + value_layer = self.transpose_for_scores(self.value_proj(hidden_states), self.num_attention_heads) + + rel_att = None + # Take the dot product between "query" and "key" to get the raw attention scores. + scale_factor = 1 + if "c2p" in self.pos_att_type: + scale_factor += 1 + if "p2c" in self.pos_att_type: + scale_factor += 1 + scale = math.sqrt(query_layer.size(-1) * scale_factor) + attention_scores = torch.bmm(query_layer, key_layer.transpose(-1, -2)) / scale + if self.relative_attention: + rel_embeddings = self.pos_dropout(rel_embeddings) + rel_att = self.disentangled_attention_bias( + query_layer, key_layer, relative_pos, rel_embeddings, scale_factor + ) + + if rel_att is not None: + attention_scores = attention_scores + rel_att + attention_scores = attention_scores + attention_scores = attention_scores.view( + -1, self.num_attention_heads, attention_scores.size(-2), attention_scores.size(-1) + ) + + # bsz x height x length x dimension + attention_probs = XSoftmax.apply(attention_scores, attention_mask, -1) + attention_probs = self.dropout(attention_probs) + context_layer = torch.bmm( + attention_probs.view(-1, attention_probs.size(-2), attention_probs.size(-1)), value_layer + ) + context_layer = ( + context_layer.view(-1, self.num_attention_heads, context_layer.size(-2), context_layer.size(-1)) + .permute(0, 2, 1, 3) + .contiguous() + ) + new_context_layer_shape = context_layer.size()[:-2] + (-1,) + context_layer = context_layer.view(new_context_layer_shape) + if output_attentions: + return (context_layer, attention_probs) + else: + return context_layer + + def disentangled_attention_bias(self, query_layer, key_layer, relative_pos, rel_embeddings, scale_factor): + if relative_pos is None: + q = query_layer.size(-2) + relative_pos = build_relative_position( + q, key_layer.size(-2), bucket_size=self.position_buckets, max_position=self.max_relative_positions + ) + if relative_pos.dim() == 2: + relative_pos = relative_pos.unsqueeze(0).unsqueeze(0) + elif relative_pos.dim() == 3: + relative_pos = relative_pos.unsqueeze(1) + # bsz x height x query x key + elif relative_pos.dim() != 4: + raise ValueError(f"Relative position ids must be of dim 2 or 3 or 4. 
{relative_pos.dim()}") + + att_span = self.pos_ebd_size + relative_pos = relative_pos.long().to(query_layer.device) + + rel_embeddings = rel_embeddings[0 : att_span * 2, :].unsqueeze(0) + if self.share_att_key: + pos_query_layer = self.transpose_for_scores( + self.query_proj(rel_embeddings), self.num_attention_heads + ).repeat(query_layer.size(0) // self.num_attention_heads, 1, 1) + pos_key_layer = self.transpose_for_scores(self.key_proj(rel_embeddings), self.num_attention_heads).repeat( + query_layer.size(0) // self.num_attention_heads, 1, 1 + ) + else: + if "c2p" in self.pos_att_type: + pos_key_layer = self.transpose_for_scores( + self.pos_key_proj(rel_embeddings), self.num_attention_heads + ).repeat( + query_layer.size(0) // self.num_attention_heads, 1, 1 + ) # .split(self.all_head_size, dim=-1) + if "p2c" in self.pos_att_type: + pos_query_layer = self.transpose_for_scores( + self.pos_query_proj(rel_embeddings), self.num_attention_heads + ).repeat( + query_layer.size(0) // self.num_attention_heads, 1, 1 + ) # .split(self.all_head_size, dim=-1) + + score = 0 + # content->position + if "c2p" in self.pos_att_type: + scale = math.sqrt(pos_key_layer.size(-1) * scale_factor) + c2p_att = torch.bmm(query_layer, pos_key_layer.transpose(-1, -2)) + c2p_pos = torch.clamp(relative_pos + att_span, 0, att_span * 2 - 1) + c2p_att = torch.gather( + c2p_att, + dim=-1, + index=c2p_pos.squeeze(0).expand([query_layer.size(0), query_layer.size(1), relative_pos.size(-1)]), + ) + score += c2p_att / scale + + # position->content + if "p2c" in self.pos_att_type: + scale = math.sqrt(pos_query_layer.size(-1) * scale_factor) + if key_layer.size(-2) != query_layer.size(-2): + r_pos = build_relative_position( + key_layer.size(-2), + key_layer.size(-2), + bucket_size=self.position_buckets, + max_position=self.max_relative_positions, + ).to(query_layer.device) + r_pos = r_pos.unsqueeze(0) + else: + r_pos = relative_pos + + p2c_pos = torch.clamp(-r_pos + att_span, 0, att_span * 2 - 1) + p2c_att = torch.bmm(key_layer, pos_query_layer.transpose(-1, -2)) + p2c_att = torch.gather( + p2c_att, + dim=-1, + index=p2c_pos.squeeze(0).expand([query_layer.size(0), key_layer.size(-2), key_layer.size(-2)]), + ).transpose(-1, -2) + score += p2c_att / scale + + return score + + +# Copied from transformers.models.deberta.modeling_deberta.DebertaEmbeddings with DebertaLayerNorm->LayerNorm +class DebertaV2Embeddings(nn.Module): + """Construct the embeddings from word, position and token_type embeddings.""" + + def __init__(self, config): + super().__init__() + pad_token_id = getattr(config, "pad_token_id", 0) + self.embedding_size = getattr(config, "embedding_size", config.hidden_size) + self.word_embeddings = nn.Embedding(config.vocab_size, self.embedding_size, padding_idx=pad_token_id) + + self.position_biased_input = getattr(config, "position_biased_input", True) + if not self.position_biased_input: + self.position_embeddings = None + else: + self.position_embeddings = nn.Embedding(config.max_position_embeddings, self.embedding_size) + + if config.type_vocab_size > 0: + self.token_type_embeddings = nn.Embedding(config.type_vocab_size, self.embedding_size) + + if self.embedding_size != config.hidden_size: + self.embed_proj = nn.Linear(self.embedding_size, config.hidden_size, bias=False) + self.LayerNorm = LayerNorm(config.hidden_size, config.layer_norm_eps) + self.dropout = StableDropout(config.hidden_dropout_prob) + self.config = config + + # position_ids (1, len position emb) is contiguous in memory and exported when serialized + 
self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1))) + + def forward(self, input_ids=None, token_type_ids=None, position_ids=None, mask=None, inputs_embeds=None): + if input_ids is not None: + input_shape = input_ids.size() + else: + input_shape = inputs_embeds.size()[:-1] + + seq_length = input_shape[1] + + if position_ids is None: + position_ids = self.position_ids[:, :seq_length] + + if token_type_ids is None: + token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=self.position_ids.device) + + if inputs_embeds is None: + inputs_embeds = self.word_embeddings(input_ids) + + if self.position_embeddings is not None: + position_embeddings = self.position_embeddings(position_ids.long()) + else: + position_embeddings = torch.zeros_like(inputs_embeds) + + embeddings = inputs_embeds + if self.position_biased_input: + embeddings += position_embeddings + if self.config.type_vocab_size > 0: + token_type_embeddings = self.token_type_embeddings(token_type_ids) + embeddings += token_type_embeddings + + if self.embedding_size != self.config.hidden_size: + embeddings = self.embed_proj(embeddings) + + embeddings = self.LayerNorm(embeddings) + + # if mask is not None: + # if mask.dim() != embeddings.dim(): + # if mask.dim() == 4: + # mask = mask.squeeze(1).squeeze(1) + # mask = mask.unsqueeze(2) + # mask = mask.to(embeddings.dtype) + + # embeddings = embeddings * mask + + embeddings = self.dropout(embeddings) + return embeddings + + +# Copied from transformers.models.deberta.modeling_deberta.DebertaPreTrainedModel with Deberta->DebertaV2 +class DebertaV2PreTrainedModel(PreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. + """ + + config_class = DebertaV2Config + base_model_prefix = "deberta" + _keys_to_ignore_on_load_missing = ["position_ids"] + _keys_to_ignore_on_load_unexpected = ["position_embeddings"] + supports_gradient_checkpointing = True + + def _init_weights(self, module): + """Initialize the weights.""" + if isinstance(module, nn.Linear): + # Slightly different from the TF version which uses truncated_normal for initialization + # cf https://github.com/pytorch/pytorch/pull/5617 + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + + def _set_gradient_checkpointing(self, module, value=False): + if isinstance(module, DebertaV2Encoder): + module.gradient_checkpointing = value + + +DEBERTA_START_DOCSTRING = r""" + The DeBERTa model was proposed in [DeBERTa: Decoding-enhanced BERT with Disentangled + Attention](https://arxiv.org/abs/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen. It's build + on top of BERT/RoBERTa with two improvements, i.e. disentangled attention and enhanced mask decoder. With those two + improvements, it out perform BERT/RoBERTa on a majority of tasks with 80GB pretraining data. + + This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. 
+ Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage + and behavior.``` + + + Parameters: + config ([`DebertaV2Config`]): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. +""" + +DEBERTA_INPUTS_DOCSTRING = r""" + Args: + input_ids (`torch.LongTensor` of shape `({0})`): + Indices of input sequence tokens in the vocabulary. + + Indices can be obtained using [`DebertaV2Tokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + attention_mask (`torch.FloatTensor` of shape `({0})`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + token_type_ids (`torch.LongTensor` of shape `({0})`, *optional*): + Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0, + 1]`: + + - 0 corresponds to a *sentence A* token, + - 1 corresponds to a *sentence B* token. + + [What are token type IDs?](../glossary#token-type-ids) + position_ids (`torch.LongTensor` of shape `({0})`, *optional*): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, + config.max_position_embeddings - 1]`. + + [What are position IDs?](../glossary#position-ids) + inputs_embeds (`torch.FloatTensor` of shape `({0}, hidden_size)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This + is useful if you want more control over how to convert *input_ids* indices into associated vectors than the + model's internal embedding lookup matrix. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. +""" + + +@add_start_docstrings( + "The bare DeBERTa Model transformer outputting raw hidden-states without any specific head on top.", + DEBERTA_START_DOCSTRING, +) +# Copied from transformers.models.deberta.modeling_deberta.DebertaModel with Deberta->DebertaV2 +class DebertaV2Model(DebertaV2PreTrainedModel): + def __init__(self, config): + super().__init__(config) + + self.embeddings = DebertaV2Embeddings(config) + self.encoder = DebertaV2Encoder(config) + self.z_steps = 0 + self.config = config + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.embeddings.word_embeddings + + def set_input_embeddings(self, new_embeddings): + self.embeddings.word_embeddings = new_embeddings + + def _prune_heads(self, heads_to_prune): + """ + Prunes heads of the model. 
heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base + class PreTrainedModel + """ + raise NotImplementedError("The prune function is not implemented in DeBERTa model.") + + @add_start_docstrings_to_model_forward(DEBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_code_sample_docstrings( + processor_class=_TOKENIZER_FOR_DOC, + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=BaseModelOutput, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + token_type_ids: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, BaseModelOutput]: + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") + elif input_ids is not None: + input_shape = input_ids.size() + elif inputs_embeds is not None: + input_shape = inputs_embeds.size()[:-1] + else: + raise ValueError("You have to specify either input_ids or inputs_embeds") + + device = input_ids.device if input_ids is not None else inputs_embeds.device + + if attention_mask is None: + attention_mask = torch.ones(input_shape, device=device) + if token_type_ids is None: + token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device) + + embedding_output = self.embeddings( + input_ids=input_ids, + token_type_ids=token_type_ids, + position_ids=position_ids, + mask=attention_mask, + inputs_embeds=inputs_embeds, + ) + + encoder_outputs = self.encoder( + embedding_output, + attention_mask, + output_hidden_states=True, + output_attentions=output_attentions, + return_dict=return_dict, + ) + encoded_layers = encoder_outputs[1] + + if self.z_steps > 1: + hidden_states = encoded_layers[-2] + layers = [self.encoder.layer[-1] for _ in range(self.z_steps)] + query_states = encoded_layers[-1] + rel_embeddings = self.encoder.get_rel_embedding() + attention_mask = self.encoder.get_attention_mask(attention_mask) + rel_pos = self.encoder.get_rel_pos(embedding_output) + for layer in layers[1:]: + query_states = layer( + hidden_states, + attention_mask, + output_attentions=False, + query_states=query_states, + relative_pos=rel_pos, + rel_embeddings=rel_embeddings, + ) + encoded_layers.append(query_states) + + sequence_output = encoded_layers[-1] + + if not return_dict: + return (sequence_output,) + encoder_outputs[(1 if output_hidden_states else 2) :] + + return BaseModelOutput( + last_hidden_state=sequence_output, + hidden_states=encoder_outputs.hidden_states if output_hidden_states else None, + attentions=encoder_outputs.attentions, + ) + + +@add_start_docstrings("""DeBERTa Model with a `language modeling` head on top.""", DEBERTA_START_DOCSTRING) +# Copied from transformers.models.deberta.modeling_deberta.DebertaForMaskedLM with Deberta->DebertaV2 +class DebertaV2ForMaskedLM(DebertaV2PreTrainedModel): + _keys_to_ignore_on_load_unexpected = [r"pooler"] + _keys_to_ignore_on_load_missing = 
[r"position_ids", r"predictions.decoder.bias"] + + def __init__(self, config): + super().__init__(config) + + self.deberta = DebertaV2Model(config) + self.cls = DebertaV2OnlyMLMHead(config) + + # Initialize weights and apply final processing + self.post_init() + + def get_output_embeddings(self): + return self.cls.predictions.decoder + + def set_output_embeddings(self, new_embeddings): + self.cls.predictions.decoder = new_embeddings + + @add_start_docstrings_to_model_forward(DEBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_code_sample_docstrings( + processor_class=_TOKENIZER_FOR_DOC, + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=MaskedLMOutput, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + token_type_ids: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + labels: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, MaskedLMOutput]: + r""" + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ..., + config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the + loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]` + """ + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.deberta( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + prediction_scores = self.cls(sequence_output) + + masked_lm_loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() # -100 index = padding token + masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1)) + + if not return_dict: + output = (prediction_scores,) + outputs[1:] + return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output + + return MaskedLMOutput( + loss=masked_lm_loss, + logits=prediction_scores, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +# copied from transformers.models.bert.BertPredictionHeadTransform with bert -> deberta +class DebertaV2PredictionHeadTransform(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + if isinstance(config.hidden_act, str): + self.transform_act_fn = ACT2FN[config.hidden_act] + else: + self.transform_act_fn = config.hidden_act + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + + def forward(self, hidden_states): + hidden_states = self.dense(hidden_states) + hidden_states = self.transform_act_fn(hidden_states) + hidden_states = self.LayerNorm(hidden_states) + return hidden_states + + +# copied from transformers.models.bert.BertLMPredictionHead with bert -> deberta +class DebertaV2LMPredictionHead(nn.Module): + def __init__(self, config): + super().__init__() + self.transform = DebertaV2PredictionHeadTransform(config) + + # The output weights are the same as the input embeddings, but there is + # an 
output-only bias for each token. + self.decoder = nn.Linear(config.hidden_size, config.vocab_size, bias=False) + + self.bias = nn.Parameter(torch.zeros(config.vocab_size)) + + # Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings` + self.decoder.bias = self.bias + + def forward(self, hidden_states): + hidden_states = self.transform(hidden_states) + hidden_states = self.decoder(hidden_states) + return hidden_states + + +# copied from transformers.models.bert.BertOnlyMLMHead with bert -> deberta +class DebertaV2OnlyMLMHead(nn.Module): + def __init__(self, config): + super().__init__() + self.predictions = DebertaV2LMPredictionHead(config) + + def forward(self, sequence_output): + prediction_scores = self.predictions(sequence_output) + return prediction_scores + + +@add_start_docstrings( + """ + DeBERTa Model transformer with a sequence classification/regression head on top (a linear layer on top of the + pooled output) e.g. for GLUE tasks. + """, + DEBERTA_START_DOCSTRING, +) +# Copied from transformers.models.deberta.modeling_deberta.DebertaForSequenceClassification with Deberta->DebertaV2 +class DebertaV2ForSequenceClassification(DebertaV2PreTrainedModel): + def __init__(self, config): + super().__init__(config) + + num_labels = getattr(config, "num_labels", 2) + self.num_labels = num_labels + + self.deberta = DebertaV2Model(config) + self.pooler = ContextPooler(config) + output_dim = self.pooler.output_dim + + self.classifier = nn.Linear(output_dim, num_labels) + drop_out = getattr(config, "cls_dropout", None) + drop_out = self.config.hidden_dropout_prob if drop_out is None else drop_out + self.dropout = StableDropout(drop_out) + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.deberta.get_input_embeddings() + + def set_input_embeddings(self, new_embeddings): + self.deberta.set_input_embeddings(new_embeddings) + + @add_start_docstrings_to_model_forward(DEBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_code_sample_docstrings( + processor_class=_TOKENIZER_FOR_DOC, + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=SequenceClassifierOutput, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + token_type_ids: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + labels: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, SequenceClassifierOutput]: + r""" + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If + `config.num_labels > 1` a classification loss is computed (Cross-Entropy). 
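+
+        A minimal illustration of the two label conventions described above (hypothetical tensors; no checkpoint or
+        tokenizer is assumed):
+
+        ```python
+        >>> import torch
+        >>> # classification (config.num_labels > 1): one class index per example, shape (batch_size,)
+        >>> labels_cls = torch.tensor([0, 2, 1])
+        >>> # regression (config.num_labels == 1): one float target per example, shape (batch_size,)
+        >>> labels_reg = torch.tensor([0.7, 1.3, 0.2])
+        ```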
+ """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.deberta( + input_ids, + token_type_ids=token_type_ids, + attention_mask=attention_mask, + position_ids=position_ids, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + encoder_layer = outputs[0] + pooled_output = self.pooler(encoder_layer) + pooled_output = self.dropout(pooled_output) + logits = self.classifier(pooled_output) + + loss = None + if labels is not None: + if self.config.problem_type is None: + if self.num_labels == 1: + # regression task + loss_fn = nn.MSELoss() + logits = logits.view(-1).to(labels.dtype) + loss = loss_fn(logits, labels.view(-1)) + elif labels.dim() == 1 or labels.size(-1) == 1: + label_index = (labels >= 0).nonzero() + labels = labels.long() + if label_index.size(0) > 0: + labeled_logits = torch.gather( + logits, 0, label_index.expand(label_index.size(0), logits.size(1)) + ) + labels = torch.gather(labels, 0, label_index.view(-1)) + loss_fct = CrossEntropyLoss() + loss = loss_fct(labeled_logits.view(-1, self.num_labels).float(), labels.view(-1)) + else: + loss = torch.tensor(0).to(logits) + else: + log_softmax = nn.LogSoftmax(-1) + loss = -((log_softmax(logits) * labels).sum(-1)).mean() + elif self.config.problem_type == "regression": + loss_fct = MSELoss() + if self.num_labels == 1: + loss = loss_fct(logits.squeeze(), labels.squeeze()) + else: + loss = loss_fct(logits, labels) + elif self.config.problem_type == "single_label_classification": + loss_fct = CrossEntropyLoss() + loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) + elif self.config.problem_type == "multi_label_classification": + loss_fct = BCEWithLogitsLoss() + loss = loss_fct(logits, labels) + if not return_dict: + output = (logits,) + outputs[1:] + return ((loss,) + output) if loss is not None else output + + return SequenceClassifierOutput( + loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions + ) + + +@add_start_docstrings( + """ + DeBERTa Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for + Named-Entity-Recognition (NER) tasks. 
+ """, + DEBERTA_START_DOCSTRING, +) +# Copied from transformers.models.deberta.modeling_deberta.DebertaForTokenClassification with Deberta->DebertaV2 +class DebertaV2ForTokenClassification(DebertaV2PreTrainedModel): + _keys_to_ignore_on_load_unexpected = [r"pooler"] + + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + + self.deberta = DebertaV2Model(config) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + self.classifier = nn.Linear(config.hidden_size, config.num_labels) + + # Initialize weights and apply final processing + self.post_init() + + @add_start_docstrings_to_model_forward(DEBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_code_sample_docstrings( + processor_class=_TOKENIZER_FOR_DOC, + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=TokenClassifierOutput, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + token_type_ids: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + labels: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, TokenClassifierOutput]: + r""" + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`. + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.deberta( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + + sequence_output = self.dropout(sequence_output) + logits = self.classifier(sequence_output) + + loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() + loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) + + if not return_dict: + output = (logits,) + outputs[1:] + return ((loss,) + output) if loss is not None else output + + return TokenClassifierOutput( + loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions + ) + + +@add_start_docstrings( + """ + DeBERTa Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear + layers on top of the hidden-states output to compute `span start logits` and `span end logits`). 
+ """, + DEBERTA_START_DOCSTRING, +) +# Copied from transformers.models.deberta.modeling_deberta.DebertaForQuestionAnswering with Deberta->DebertaV2 +class DebertaV2ForQuestionAnswering(DebertaV2PreTrainedModel): + _keys_to_ignore_on_load_unexpected = [r"pooler"] + + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + + self.deberta = DebertaV2Model(config) + self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels) + + # Initialize weights and apply final processing + self.post_init() + + @add_start_docstrings_to_model_forward(DEBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_code_sample_docstrings( + processor_class=_TOKENIZER_FOR_DOC, + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=QuestionAnsweringModelOutput, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + token_type_ids: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + start_positions: Optional[torch.Tensor] = None, + end_positions: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, QuestionAnsweringModelOutput]: + r""" + start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for position (index) of the start of the labelled span for computing the token classification loss. + Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence + are not taken into account for computing the loss. + end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for position (index) of the end of the labelled span for computing the token classification loss. + Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence + are not taken into account for computing the loss. 
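+
+        A small illustration of the expected span-label format (hypothetical values; no checkpoint is assumed):
+
+        ```python
+        >>> import torch
+        >>> # one answer span per example: token indices of the first and last answer tokens, each of
+        >>> # shape (batch_size,); positions outside the sequence are clamped and ignored by the loss
+        >>> start_positions = torch.tensor([14, 3])
+        >>> end_positions = torch.tensor([17, 6])
+        ```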
+ """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.deberta( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + + logits = self.qa_outputs(sequence_output) + start_logits, end_logits = logits.split(1, dim=-1) + start_logits = start_logits.squeeze(-1).contiguous() + end_logits = end_logits.squeeze(-1).contiguous() + + total_loss = None + if start_positions is not None and end_positions is not None: + # If we are on multi-GPU, split add a dimension + if len(start_positions.size()) > 1: + start_positions = start_positions.squeeze(-1) + if len(end_positions.size()) > 1: + end_positions = end_positions.squeeze(-1) + # sometimes the start/end positions are outside our model inputs, we ignore these terms + ignored_index = start_logits.size(1) + start_positions = start_positions.clamp(0, ignored_index) + end_positions = end_positions.clamp(0, ignored_index) + + loss_fct = CrossEntropyLoss(ignore_index=ignored_index) + start_loss = loss_fct(start_logits, start_positions) + end_loss = loss_fct(end_logits, end_positions) + total_loss = (start_loss + end_loss) / 2 + + if not return_dict: + output = (start_logits, end_logits) + outputs[1:] + return ((total_loss,) + output) if total_loss is not None else output + + return QuestionAnsweringModelOutput( + loss=total_loss, + start_logits=start_logits, + end_logits=end_logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@add_start_docstrings( + """ + DeBERTa Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a + softmax) e.g. for RocStories/SWAG tasks. + """, + DEBERTA_START_DOCSTRING, +) +class DebertaV2ForMultipleChoice(DebertaV2PreTrainedModel): + def __init__(self, config): + super().__init__(config) + + num_labels = getattr(config, "num_labels", 2) + self.num_labels = num_labels + + self.deberta = DebertaV2Model(config) + self.pooler = ContextPooler(config) + output_dim = self.pooler.output_dim + + self.classifier = nn.Linear(output_dim, 1) + drop_out = getattr(config, "cls_dropout", None) + drop_out = self.config.hidden_dropout_prob if drop_out is None else drop_out + self.dropout = StableDropout(drop_out) + + self.init_weights() + + def get_input_embeddings(self): + return self.deberta.get_input_embeddings() + + def set_input_embeddings(self, new_embeddings): + self.deberta.set_input_embeddings(new_embeddings) + + @add_start_docstrings_to_model_forward(DEBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_code_sample_docstrings( + processor_class=_TOKENIZER_FOR_DOC, + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=MultipleChoiceModelOutput, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + inputs_embeds=None, + labels=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the multiple choice classification loss. Indices should be in `[0, ..., + num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. 
(See + `input_ids` above) + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1] + + flat_input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None + flat_position_ids = position_ids.view(-1, position_ids.size(-1)) if position_ids is not None else None + flat_token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None + flat_attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None + flat_inputs_embeds = ( + inputs_embeds.view(-1, inputs_embeds.size(-2), inputs_embeds.size(-1)) + if inputs_embeds is not None + else None + ) + + outputs = self.deberta( + flat_input_ids, + position_ids=flat_position_ids, + token_type_ids=flat_token_type_ids, + attention_mask=flat_attention_mask, + inputs_embeds=flat_inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + encoder_layer = outputs[0] + pooled_output = self.pooler(encoder_layer) + pooled_output = self.dropout(pooled_output) + logits = self.classifier(pooled_output) + reshaped_logits = logits.view(-1, num_choices) + + loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() + loss = loss_fct(reshaped_logits, labels) + + if not return_dict: + output = (reshaped_logits,) + outputs[1:] + return ((loss,) + output) if loss is not None else output + + return MultipleChoiceModelOutput( + loss=loss, + logits=reshaped_logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) diff --git a/fengshen/models/deepVAE/__init__.py b/fengshen/models/deepVAE/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..bcf019eaf0b04fd1c23d0d51d3ea0f1b62d1c306 --- /dev/null +++ b/fengshen/models/deepVAE/__init__.py @@ -0,0 +1,15 @@ +# coding=utf-8 +# Copyright 2022 IDEA-CCNL The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" PyTorch Della model. """ diff --git a/fengshen/models/deepVAE/configuration_della.py b/fengshen/models/deepVAE/configuration_della.py new file mode 100644 index 0000000000000000000000000000000000000000..332e6d71863c3f7266477ea3691a8226b602df01 --- /dev/null +++ b/fengshen/models/deepVAE/configuration_della.py @@ -0,0 +1,130 @@ +# coding=utf-8 +# Copyright 2022 IDEA-CCNL and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" Della model configuration """
+
+from transformers.configuration_utils import PretrainedConfig
+
+
+Della_PRETRAINED_CONFIG_ARCHIVE_MAP = {
+    "Della-226M-base": "https://huggingface.co./IDEA-CCNL/Randeng-DELLA-226M-Chinese/resolve/main/config.json"
+}
+
+
+class DellaModelConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`~DellaModel`]. It is used to instantiate a
+    DellaModel according to the specified arguments, defining the model architecture. Instantiating a configuration
+    with the defaults will yield a configuration similar to that of
+    [Randeng-DELLA-226M-Chinese](https://huggingface.co./IDEA-CCNL/Randeng-DELLA-226M-Chinese).
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+    Args:
+        vocab_size (`int`, *optional*, defaults to 50257):
+            Vocabulary size of the Della model. Defines the number of different tokens that can be represented by the
+            `inputs_ids` passed when calling [`~DellaModel`].
+        n_positions (`int`, *optional*, defaults to 1024):
+            The maximum sequence length that this model might ever be used with.
+        n_embd (`int`, *optional*, defaults to 768):
+            Dimensionality of the embeddings and hidden states.
+        n_layer (`int`, *optional*, defaults to 12):
+            Number of hidden layers in the Transformer decoder.
+        n_head (`int`, *optional*, defaults to 12):
+            Number of attention heads for each attention layer.
+        n_inner (`int`, *optional*, defaults to `None`):
+            Dimensionality of the inner feed-forward layers. `None` sets it to 4 times `n_embd`.
+        activation_function (`str`, *optional*, defaults to `"gelu_new"`):
+            Activation function, to be selected in the list `["relu", "silu", "gelu", "tanh", "gelu_new"]`.
+        resid_pdrop (`float`, *optional*, defaults to 0.1):
+            The dropout probability for all fully connected layers in the embeddings and residual branches.
+        embd_pdrop (`float`, *optional*, defaults to 0.1):
+            The dropout ratio for the embeddings.
+        attn_pdrop (`float`, *optional*, defaults to 0.1):
+            The dropout ratio for the attention probabilities.
+        layer_norm_epsilon (`float`, *optional*, defaults to 1e-5):
+            The epsilon used by the layer normalization layers.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        scale_attn_weights (`bool`, *optional*, defaults to `True`):
+            Whether to scale attention weights by dividing by `sqrt(hidden_size)`.
+        use_cache (`bool`, *optional*, defaults to `True`):
+            Whether or not the model should return the last key/value attentions (not used by all models).
+        scale_attn_by_inverse_layer_idx (`bool`, *optional*, defaults to `False`):
+            Whether to additionally scale attention weights by `1 / (layer_idx + 1)`.
+        reorder_and_upcast_attn (`bool`, *optional*, defaults to `False`):
+            Whether to scale keys (K) prior to computing attention (dot-product) and upcast the attention
+            dot-product/softmax to float when training with mixed precision.
+        bos_token_id (`int`, *optional*, defaults to 21128):
+            Id of the beginning-of-sequence token in the vocabulary.
+        eos_token_id (`int`, *optional*, defaults to 21129):
+            Id of the end-of-sequence token in the vocabulary.
+        pad_token_id (`int`, *optional*, defaults to 0):
+            Id of the padding token in the vocabulary.
+        CVAE (`bool`, *optional*, defaults to `False`):
+            Whether to configure the model as a conditional VAE.
+        latent_dim (`int`, *optional*, defaults to 256):
+            Dimensionality of the latent variables used by the VAE.
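+
+    Example (an illustrative sketch; it only assumes the class is importable from this module, as it is in
+    `deep_vae.py`):
+
+    ```python
+    >>> from fengshen.models.deepVAE.configuration_della import DellaModelConfig
+
+    >>> # Initializing a configuration with the defaults (similar to Randeng-DELLA-226M-Chinese)
+    >>> configuration = DellaModelConfig()
+    >>> configuration.n_layer
+    12
+    ```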
+ +""" + model_type = "DellaModel" + keys_to_ignore_at_inference = ["past_key_values"] + attribute_map = { + "hidden_size": "n_embd", + "max_position_embeddings": "n_positions", + "num_attention_heads": "n_head", + "num_hidden_layers": "n_layer", + } + + def __init__( + self, + vocab_size=50257, + n_positions=1024, + n_embd=768, + n_layer=12, + n_head=12, + n_inner=None, + activation_function="gelu_new", + resid_pdrop=0.1, + embd_pdrop=0.1, + attn_pdrop=0.1, + layer_norm_epsilon=1e-5, + initializer_range=0.02, + scale_attn_weights=True, + use_cache=True, + scale_attn_by_inverse_layer_idx=False, + reorder_and_upcast_attn=False, + bos_token_id=21128, + eos_token_id=21129, + pad_token_id=0, + CVAE=False, + latent_dim=256, + **kwargs, + ): + self.vocab_size = vocab_size + self.n_positions = n_positions + self.n_embd = n_embd + self.n_layer = n_layer + self.n_head = n_head + self.n_inner = n_inner + self.activation_function = activation_function + self.resid_pdrop = resid_pdrop + self.embd_pdrop = embd_pdrop + self.attn_pdrop = attn_pdrop + self.layer_norm_epsilon = layer_norm_epsilon + self.initializer_range = initializer_range + + self.scale_attn_weights = scale_attn_weights + self.use_cache = use_cache + self.scale_attn_by_inverse_layer_idx = scale_attn_by_inverse_layer_idx + self.reorder_and_upcast_attn = reorder_and_upcast_attn + + self.bos_token_id = bos_token_id + self.eos_token_id = eos_token_id + self.pad_token_id = pad_token_id + self.CVAE = CVAE + self.latent_dim = latent_dim + super().__init__(bos_token_id=bos_token_id, eos_token_id=eos_token_id, pad_token_id=pad_token_id, **kwargs) diff --git a/fengshen/models/deepVAE/deep_vae.py b/fengshen/models/deepVAE/deep_vae.py new file mode 100644 index 0000000000000000000000000000000000000000..08f03849469375d6f45eb26321b257b674250e77 --- /dev/null +++ b/fengshen/models/deepVAE/deep_vae.py @@ -0,0 +1,258 @@ +# coding=utf-8 +# Copyright 2022 IDEA-CCNL The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" PyTorch Della model. 
""" + +import torch +import torch.nn as nn +import torch.nn.functional as F +from dataclasses import dataclass +from typing import Optional, Tuple +from transformers.modeling_outputs import ModelOutput +from transformers.modeling_utils import PreTrainedModel +from fengshen.models.deepVAE.configuration_della import DellaModelConfig +from fengshen.models.deepVAE.latent_connector import GPT2ForDecoderLatentConnector, GPT2ForEncoderLatentConnector +from fengshen.models.deepVAE.utils import connect, compute_kl_loss, top_k_top_p_filtering, enforce_repetition_penalty + + +_CHECKPOINT_FOR_DOC = "della-226M-base" +_CONFIG_FOR_DOC = "DellaModelConfig" +_TOKENIZER_FOR_DOC = "BertTokenizer" +Della_model_PRETRAINED_MODEL_ARCHIVE_LIST = [ + "della-226M-base" +] + + +@dataclass +class DellaModelOutput(ModelOutput): + logits: torch.FloatTensor = None + posterior_latents: Optional[Tuple[torch.FloatTensor]] = None + prior_latent: Optional[Tuple[torch.FloatTensor]] = None + + +class latent_layer(nn.Module): + def __init__(self, input_dim) -> None: + super().__init__() + self.W_hh = nn.Linear(input_dim, input_dim, bias=False) + self.W_ih = nn.Linear(input_dim, input_dim, bias=False) + self.tanh = nn.Tanh() + + def forward(self, z_lt_lm1, z_lm1): + # inputs are z_