calico-1226
commited on
Commit
•
5e0db46
1
Parent(s):
4bac9a2
Upload model
Browse files- .gitignore +1 -0
- arguments.json +53 -0
- arguments.pkl +3 -0
- config.json +30 -0
- environ.txt +119 -0
- latest +1 -0
- pytorch_model.bin +3 -0
- script.sh +132 -0
- special_tokens_map.json +30 -0
- stderr.log +0 -0
- stdout.log +64 -0
- tokenizer.json +0 -0
- tokenizer_config.json +50 -0
- zero_to_fp32.py +604 -0
.gitignore
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
*
|
arguments.json
ADDED
@@ -0,0 +1,53 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"model_name_or_path": "models/alpaca-7b-reproduced",
|
3 |
+
"max_length": 1024,
|
4 |
+
"trust_remote_code": true,
|
5 |
+
"train_datasets": [
|
6 |
+
[
|
7 |
+
"alpaca",
|
8 |
+
{
|
9 |
+
"proportion": 1.0
|
10 |
+
}
|
11 |
+
]
|
12 |
+
],
|
13 |
+
"eval_datasets": null,
|
14 |
+
"epochs": 3,
|
15 |
+
"per_device_train_batch_size": 8,
|
16 |
+
"per_device_eval_batch_size": 8,
|
17 |
+
"gradient_accumulation_steps": 8,
|
18 |
+
"gradient_checkpointing": true,
|
19 |
+
"lr": 2e-05,
|
20 |
+
"lr_scheduler_type": "cosine",
|
21 |
+
"lr_warmup_ratio": 0.03,
|
22 |
+
"weight_decay": 0.0,
|
23 |
+
"seed": 42,
|
24 |
+
"fp16": false,
|
25 |
+
"bf16": true,
|
26 |
+
"tf32": true,
|
27 |
+
"eval_strategy": "epoch",
|
28 |
+
"eval_interval": 1000000,
|
29 |
+
"need_eval": false,
|
30 |
+
"eval_split_ratio": null,
|
31 |
+
"output_dir": "/home/juntao/Projects/roo/models/alpaca-7b-sft",
|
32 |
+
"log_type": "wandb",
|
33 |
+
"log_dir": "/home/juntao/Projects/roo/models/alpaca-7b-sft",
|
34 |
+
"log_project": "SFT-alpaca",
|
35 |
+
"log_group": null,
|
36 |
+
"log_run_name": "sft-2024-09-04-12-52-34",
|
37 |
+
"save_16bit": false,
|
38 |
+
"save_interval": 1000000,
|
39 |
+
"local_rank": 0,
|
40 |
+
"zero_stage": 3,
|
41 |
+
"offload": "none",
|
42 |
+
"deepspeed": false,
|
43 |
+
"deepspeed_config": null,
|
44 |
+
"deepscale": false,
|
45 |
+
"deepscale_config": null,
|
46 |
+
"global_rank": 0,
|
47 |
+
"device": {
|
48 |
+
"type": "torch.device",
|
49 |
+
"repr": "device(type='cuda', index=0)"
|
50 |
+
},
|
51 |
+
"num_update_steps_per_epoch": 102,
|
52 |
+
"total_training_steps": 306
|
53 |
+
}
|
arguments.pkl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:028bdcd52461fcb6f24d65acdb4ba31a8f67b61a7328cfd75cefb41a300b1203
|
3 |
+
size 1030
|
config.json
ADDED
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"_name_or_path": "models/alpaca-7b-reproduced",
|
3 |
+
"architectures": [
|
4 |
+
"LlamaForCausalLM"
|
5 |
+
],
|
6 |
+
"attention_bias": false,
|
7 |
+
"attention_dropout": 0.0,
|
8 |
+
"bos_token_id": 1,
|
9 |
+
"eos_token_id": 2,
|
10 |
+
"hidden_act": "silu",
|
11 |
+
"hidden_size": 4096,
|
12 |
+
"initializer_range": 0.02,
|
13 |
+
"intermediate_size": 11008,
|
14 |
+
"max_position_embeddings": 2048,
|
15 |
+
"mlp_bias": false,
|
16 |
+
"model_type": "llama",
|
17 |
+
"num_attention_heads": 32,
|
18 |
+
"num_hidden_layers": 32,
|
19 |
+
"num_key_value_heads": 32,
|
20 |
+
"pad_token_id": 32000,
|
21 |
+
"pretraining_tp": 1,
|
22 |
+
"rms_norm_eps": 1e-06,
|
23 |
+
"rope_scaling": null,
|
24 |
+
"rope_theta": 10000.0,
|
25 |
+
"tie_word_embeddings": false,
|
26 |
+
"torch_dtype": "bfloat16",
|
27 |
+
"transformers_version": "4.41.2",
|
28 |
+
"use_cache": true,
|
29 |
+
"vocab_size": 32001
|
30 |
+
}
|
environ.txt
ADDED
@@ -0,0 +1,119 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
BAT_THEME=Monokai Extended
|
2 |
+
BROWSER=/home/juntao/.vscode-server/cli/servers/Stable-fee1edb8d6d72a0ddff41e5f71a671c23ed924b9/server/bin/helpers/browser.sh
|
3 |
+
CC=/usr/bin/gcc
|
4 |
+
COLORTERM=truecolor
|
5 |
+
CONDA_DEFAULT_ENV=roo
|
6 |
+
CONDA_EXE=/home/juntao/Miniconda3/bin/conda
|
7 |
+
CONDA_PREFIX=/home/juntao/Miniconda3/envs/roo
|
8 |
+
CONDA_PROMPT_MODIFIER=(roo)
|
9 |
+
CONDA_PYTHONBREAKPOINT=
|
10 |
+
CONDA_PYTHON_EXE=/home/juntao/Miniconda3/bin/python
|
11 |
+
CONDA_ROOT=/home/juntao/Miniconda3
|
12 |
+
CONDA_SHLVL=1
|
13 |
+
CPLUS_INCLUDE_PATH=/usr/local/cuda/include:/usr/local/cuda/extras/CUPTI/include
|
14 |
+
CROSS_RANK=0
|
15 |
+
CROSS_SIZE=1
|
16 |
+
CUDA_HOME=/usr/local/cuda
|
17 |
+
CUDA_MODULE_LOADING=LAZY
|
18 |
+
CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
|
19 |
+
CXX=/usr/bin/g++
|
20 |
+
C_INCLUDE_PATH=/usr/local/cuda/include:/usr/local/cuda/extras/CUPTI/include
|
21 |
+
DBUS_SESSION_BUS_ADDRESS=unix:path=/run/user/1001/bus
|
22 |
+
FC=/usr/bin/gfortran
|
23 |
+
FPATH=/home/juntao/.oh-my-zsh/plugins/vscode:/home/juntao/.oh-my-zsh/plugins/brew:/home/juntao/.oh-my-zsh/plugins/tmux:/home/juntao/.oh-my-zsh/plugins/docker:/home/juntao/.oh-my-zsh/plugins/pylint:/home/juntao/.oh-my-zsh/plugins/pip:/home/juntao/.oh-my-zsh/plugins/python:/home/juntao/.oh-my-zsh/plugins/git-auto-fetch:/home/juntao/.oh-my-zsh/plugins/git:/home/juntao/.oh-my-zsh/plugins/alias-finder:/home/juntao/.oh-my-zsh/plugins/rsync:/home/juntao/.oh-my-zsh/plugins/cp:/home/juntao/.oh-my-zsh/plugins/copypath:/home/juntao/.oh-my-zsh/plugins/copyfile:/home/juntao/.oh-my-zsh/plugins/fzf:/home/juntao/.oh-my-zsh/plugins/colored-man-pages:/home/juntao/.oh-my-zsh/plugins/colorize:/home/juntao/.oh-my-zsh/custom/plugins/conda-zsh-completion:/home/juntao/.oh-my-zsh/custom/plugins/zsh-completions:/home/juntao/.oh-my-zsh/custom/plugins/zsh-autosuggestions:/home/juntao/.oh-my-zsh/custom/plugins/zsh-syntax-highlighting:/home/juntao/.oh-my-zsh/plugins/ubuntu:/home/juntao/.oh-my-zsh/functions:/home/juntao/.oh-my-zsh/completions:/home/juntao/.oh-my-zsh/custom/functions:/home/juntao/.oh-my-zsh/custom/completions:/home/linuxbrew/.linuxbrew/share/zsh/site-functions:/home/juntao/.oh-my-zsh/plugins/vscode:/home/juntao/.oh-my-zsh/plugins/brew:/home/juntao/.oh-my-zsh/plugins/tmux:/home/juntao/.oh-my-zsh/plugins/docker:/home/juntao/.oh-my-zsh/plugins/pylint:/home/juntao/.oh-my-zsh/plugins/pip:/home/juntao/.oh-my-zsh/plugins/python:/home/juntao/.oh-my-zsh/plugins/git-auto-fetch:/home/juntao/.oh-my-zsh/plugins/git:/home/juntao/.oh-my-zsh/plugins/alias-finder:/home/juntao/.oh-my-zsh/plugins/rsync:/home/juntao/.oh-my-zsh/plugins/cp:/home/juntao/.oh-my-zsh/plugins/copypath:/home/juntao/.oh-my-zsh/plugins/copyfile:/home/juntao/.oh-my-zsh/plugins/fzf:/home/juntao/.oh-my-zsh/plugins/colored-man-pages:/home/juntao/.oh-my-zsh/plugins/colorize:/home/juntao/.oh-my-zsh/custom/plugins/conda-zsh-completion:/home/juntao/.oh-my-zsh/custom/plugins/zsh-completions:/home/juntao/.oh-my-zsh/custom/plugins/zsh-au
tosuggestions:/home/juntao/.oh-my-zsh/custom/plugins/zsh-syntax-highlighting:/home/juntao/.oh-my-zsh/plugins/ubuntu:/home/juntao/.oh-my-zsh/functions:/home/juntao/.oh-my-zsh/completions:/home/juntao/.oh-my-zsh/custom/functions:/home/juntao/.oh-my-zsh/custom/completions:/home/juntao/.oh-my-zsh/cache/completions:/usr/local/share/zsh/site-functions:/usr/share/zsh/vendor-functions:/usr/share/zsh/vendor-completions:/usr/share/zsh/functions/Calendar:/usr/share/zsh/functions/Chpwd:/usr/share/zsh/functions/Completion:/usr/share/zsh/functions/Completion/AIX:/usr/share/zsh/functions/Completion/BSD:/usr/share/zsh/functions/Completion/Base:/usr/share/zsh/functions/Completion/Cygwin:/usr/share/zsh/functions/Completion/Darwin:/usr/share/zsh/functions/Completion/Debian:/usr/share/zsh/functions/Completion/Linux:/usr/share/zsh/functions/Completion/Mandriva:/usr/share/zsh/functions/Completion/Redhat:/usr/share/zsh/functions/Completion/Solaris:/usr/share/zsh/functions/Completion/Unix:/usr/share/zsh/functions/Completion/X:/usr/share/zsh/functions/Completion/Zsh:/usr/share/zsh/functions/Completion/openSUSE:/usr/share/zsh/functions/Exceptions:/usr/share/zsh/functions/MIME:/usr/share/zsh/functions/Math:/usr/share/zsh/functions/Misc:/usr/share/zsh/functions/Newuser:/usr/share/zsh/functions/Prompts:/usr/share/zsh/functions/TCP:/usr/share/zsh/functions/VCS_Info:/usr/share/zsh/functions/VCS_Info/Backends:/usr/share/zsh/functions/Zftp:/usr/share/zsh/functions/Zle:/home/linuxbrew/.linuxbrew/share/zsh/functions:/home/juntao/.oh-my-zsh/custom/plugins/zsh-completions/src:/home/juntao/.oh-my-zsh/custom/plugins/zsh-completions/src:/home/juntao/.oh-my-zsh/custom/plugins/conda-zsh-completion:/home/linuxbrew/.linuxbrew/share/zsh/site-functions
|
24 |
+
FZF_CTRL_T_COMMAND=fdfind --type file --follow --hidden --no-ignore-vcs --exclude '.git' --exclude '[Mm]iniconda3' --exclude '[Aa]naconda3' --color=always
|
25 |
+
FZF_DEFAULT_COMMAND=fdfind --type file --follow --hidden --no-ignore-vcs --exclude '.git' --exclude '[Mm]iniconda3' --exclude '[Aa]naconda3' --color=always
|
26 |
+
FZF_DEFAULT_OPTS=--height=40% --layout=reverse --ansi --preview='(batcat --color=always {} || highlight -O ansi {} || cat {}) 2>/dev/null | head -100'
|
27 |
+
GIT_ASKPASS=/home/juntao/.vscode-server/cli/servers/Stable-fee1edb8d6d72a0ddff41e5f71a671c23ed924b9/server/extensions/git/dist/askpass.sh
|
28 |
+
GSETTINGS_SCHEMA_DIR=/home/juntao/Miniconda3/envs/roo/share/glib-2.0/schemas
|
29 |
+
GSETTINGS_SCHEMA_DIR_CONDA_BACKUP=
|
30 |
+
HOME=/home/juntao
|
31 |
+
HOMEBREW_API_DOMAIN=https://mirrors.tuna.tsinghua.edu.cn/homebrew-bottles/api
|
32 |
+
HOMEBREW_BAT=true
|
33 |
+
HOMEBREW_BOTTLE_DOMAIN=https://mirrors.tuna.tsinghua.edu.cn/homebrew-bottles
|
34 |
+
HOMEBREW_BREW_GIT_REMOTE=https://mirrors.tuna.tsinghua.edu.cn/git/homebrew/brew.git
|
35 |
+
HOMEBREW_CELLAR=/home/linuxbrew/.linuxbrew/Cellar
|
36 |
+
HOMEBREW_CORE_GIT_REMOTE=https://mirrors.tuna.tsinghua.edu.cn/git/homebrew/homebrew-core.git
|
37 |
+
HOMEBREW_EDITOR=vim
|
38 |
+
HOMEBREW_PIP_INDEX_URL=https://pypi.tuna.tsinghua.edu.cn/simple
|
39 |
+
HOMEBREW_PREFIX=/home/linuxbrew/.linuxbrew
|
40 |
+
HOMEBREW_REPOSITORY=/home/linuxbrew/.linuxbrew/Homebrew
|
41 |
+
INFOPATH=/home/linuxbrew/.linuxbrew/share/info:/home/linuxbrew/.linuxbrew/share/info:/home/linuxbrew/.linuxbrew/share/info:
|
42 |
+
LANG=en_US.UTF-8
|
43 |
+
LC_ALL=en_US.utf8
|
44 |
+
LD_LIBRARY_PATH=/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64
|
45 |
+
LESS=-R -M -i -j5
|
46 |
+
LIBRARY_PATH=/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64
|
47 |
+
LOCAL_RANK=0
|
48 |
+
LOCAL_SIZE=8
|
49 |
+
LOGLEVEL=WARNING
|
50 |
+
LOGNAME=juntao
|
51 |
+
LSCOLORS=Gxfxcxdxbxegedabagacad
|
52 |
+
LS_COLORS=rs=0:di=01;34:ln=01;36:mh=00:pi=40;33:so=01;35:do=01;35:bd=40;33;01:cd=40;33;01:or=40;31;01:mi=00:su=37;41:sg=30;43:ca=30;41:tw=30;42:ow=34;42:st=37;44:ex=01;32:*.tar=01;31:*.tgz=01;31:*.arc=01;31:*.arj=01;31:*.taz=01;31:*.lha=01;31:*.lz4=01;31:*.lzh=01;31:*.lzma=01;31:*.tlz=01;31:*.txz=01;31:*.tzo=01;31:*.t7z=01;31:*.zip=01;31:*.z=01;31:*.dz=01;31:*.gz=01;31:*.lrz=01;31:*.lz=01;31:*.lzo=01;31:*.xz=01;31:*.zst=01;31:*.tzst=01;31:*.bz2=01;31:*.bz=01;31:*.tbz=01;31:*.tbz2=01;31:*.tz=01;31:*.deb=01;31:*.rpm=01;31:*.jar=01;31:*.war=01;31:*.ear=01;31:*.sar=01;31:*.rar=01;31:*.alz=01;31:*.ace=01;31:*.zoo=01;31:*.cpio=01;31:*.7z=01;31:*.rz=01;31:*.cab=01;31:*.wim=01;31:*.swm=01;31:*.dwm=01;31:*.esd=01;31:*.jpg=01;35:*.jpeg=01;35:*.mjpg=01;35:*.mjpeg=01;35:*.gif=01;35:*.bmp=01;35:*.pbm=01;35:*.pgm=01;35:*.ppm=01;35:*.tga=01;35:*.xbm=01;35:*.xpm=01;35:*.tif=01;35:*.tiff=01;35:*.png=01;35:*.svg=01;35:*.svgz=01;35:*.mng=01;35:*.pcx=01;35:*.mov=01;35:*.mpg=01;35:*.mpeg=01;35:*.m2v=01;35:*.mkv=01;35:*.webm=01;35:*.ogm=01;35:*.mp4=01;35:*.m4v=01;35:*.mp4v=01;35:*.vob=01;35:*.qt=01;35:*.nuv=01;35:*.wmv=01;35:*.asf=01;35:*.rm=01;35:*.rmvb=01;35:*.flc=01;35:*.avi=01;35:*.fli=01;35:*.flv=01;35:*.gl=01;35:*.dl=01;35:*.xcf=01;35:*.xwd=01;35:*.yuv=01;35:*.cgm=01;35:*.emf=01;35:*.ogv=01;35:*.ogx=01;35:*.aac=00;36:*.au=00;36:*.flac=00;36:*.m4a=00;36:*.mid=00;36:*.midi=00;36:*.mka=00;36:*.mp3=00;36:*.mpc=00;36:*.ogg=00;36:*.ra=00;36:*.wav=00;36:*.oga=00;36:*.opus=00;36:*.spx=00;36:*.xspf=00;36:
|
53 |
+
MASTER_ADDR=127.0.0.1
|
54 |
+
MASTER_PORT=15892
|
55 |
+
MOTD_SHOWN=pam
|
56 |
+
MPICH_CC=/usr/bin/gcc
|
57 |
+
MPICH_CXX=/usr/bin/g++
|
58 |
+
MPICH_FC=/usr/bin/gfortran
|
59 |
+
OLDPWD=/home/juntao/Projects/roo
|
60 |
+
OMPI_CC=/usr/bin/gcc
|
61 |
+
OMPI_CXX=/usr/bin/g++
|
62 |
+
OMPI_FC=/usr/bin/gfortran
|
63 |
+
P10K_LEAN_STYLE=true
|
64 |
+
P9K_SSH=1
|
65 |
+
P9K_TTY=old
|
66 |
+
PAGER=less
|
67 |
+
PATH=/home/juntao/Miniconda3/envs/roo/bin:/home/juntao/.gem/ruby/2.7.0/bin:/var/lib/gems/2.7.0/bin:/usr/local/cuda/bin:/home/linuxbrew/.linuxbrew/bin:/home/linuxbrew/.linuxbrew/sbin:/home/juntao/.vscode-server/cli/servers/Stable-fee1edb8d6d72a0ddff41e5f71a671c23ed924b9/server/bin/remote-cli:/home/juntao/.perl/bin:/home/juntao/Miniconda3/condabin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/games:/usr/local/games:/snap/bin:/home/juntao/.fzf/bin
|
68 |
+
PERL5LIB=/home/juntao/.perl/lib/perl5
|
69 |
+
PERL_LOCAL_LIB_ROOT=/home/juntao/.perl
|
70 |
+
PERL_MB_OPT=--install_base "/home/juntao/.perl"
|
71 |
+
PERL_MM_OPT=INSTALL_BASE=/home/juntao/.perl
|
72 |
+
PWD=/home/juntao/Projects/roo
|
73 |
+
PYTHONBREAKPOINT=ipdb.set_trace
|
74 |
+
PYTHONHASHSEED=42
|
75 |
+
PYTHONPATH=/home/juntao/Projects/roo
|
76 |
+
RANK=0
|
77 |
+
RUBYOPT=-W0
|
78 |
+
SHELL=/usr/bin/zsh
|
79 |
+
SHLVL=3
|
80 |
+
SSH_CLIENT=218.68.227.248 2030 24100
|
81 |
+
SSH_CONNECTION=218.68.227.248 2030 10.10.40.131 24100
|
82 |
+
SSL_CERT_DIR=/usr/lib/ssl/certs
|
83 |
+
SSL_CERT_FILE=/usr/lib/ssl/certs/ca-certificates.crt
|
84 |
+
TERM=xterm-256color
|
85 |
+
TERM_PROGRAM=vscode
|
86 |
+
TERM_PROGRAM_VERSION=1.92.2
|
87 |
+
TMUX=/tmp/tmux-1001/default,35347,0
|
88 |
+
TMUX_CONF=/home/juntao/.tmux.conf
|
89 |
+
TMUX_CONF_LOCAL=/home/juntao/.tmux.conf.local
|
90 |
+
TMUX_PANE=%0
|
91 |
+
TMUX_PROGRAM=/usr/bin/tmux
|
92 |
+
TMUX_SOCKET=/tmp/tmux-1001/default
|
93 |
+
USER=juntao
|
94 |
+
USER_ZDOTDIR=/home/juntao
|
95 |
+
VSCODE_GIT_ASKPASS_EXTRA_ARGS=
|
96 |
+
VSCODE_GIT_ASKPASS_MAIN=/home/juntao/.vscode-server/cli/servers/Stable-fee1edb8d6d72a0ddff41e5f71a671c23ed924b9/server/extensions/git/dist/askpass-main.js
|
97 |
+
VSCODE_GIT_ASKPASS_NODE=/home/juntao/.vscode-server/cli/servers/Stable-fee1edb8d6d72a0ddff41e5f71a671c23ed924b9/server/node
|
98 |
+
VSCODE_GIT_IPC_HANDLE=/run/user/1001/vscode-git-39160f57ba.sock
|
99 |
+
VSCODE_INJECTION=1
|
100 |
+
VSCODE_IPC_HOOK_CLI=/run/user/1001/vscode-ipc-95a68742-c3c1-4fc3-926f-daa969efe302.sock
|
101 |
+
VSCODE_NONCE=b605a6a9-7032-4c22-bc1d-5f29901de2f4
|
102 |
+
WANDB_API_KEY=84ca867e702f8a930beae406c06c21ec7f8acfe3
|
103 |
+
WANDB_SERVICE=2-2131706-tcp-localhost-43909
|
104 |
+
WORLD_SIZE=8
|
105 |
+
XDG_DATA_DIRS=/usr/local/share:/usr/share:/var/lib/snapd/desktop
|
106 |
+
XDG_RUNTIME_DIR=/run/user/1001
|
107 |
+
XDG_SESSION_CLASS=user
|
108 |
+
XDG_SESSION_ID=3
|
109 |
+
XDG_SESSION_TYPE=tty
|
110 |
+
ZDOTDIR=/home/juntao
|
111 |
+
ZSH=/home/juntao/.oh-my-zsh
|
112 |
+
ZSH_TMUX_CONFIG=/home/juntao/.tmux.conf
|
113 |
+
ZSH_TMUX_TERM=screen-256color
|
114 |
+
_=/home/juntao/Miniconda3/envs/roo/bin/deepspeed
|
115 |
+
_CE_CONDA=
|
116 |
+
_CE_M=
|
117 |
+
_P9K_SSH_TTY=/dev/pts/9
|
118 |
+
_P9K_TTY=/dev/pts/9
|
119 |
+
_ZSH_TMUX_FIXED_CONFIG=/home/juntao/.oh-my-zsh/plugins/tmux/tmux.extra.conf
|
latest
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
global_step304
|
pytorch_model.bin
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:6a4f065aa42dce062ab13ef1e6ebfcdcca533fd8dbe9c14ce30759c4a76c8b2b
|
3 |
+
size 26953792682
|
script.sh
ADDED
@@ -0,0 +1,132 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/env bash
|
2 |
+
#
|
3 |
+
# Copyright 2023-2024 PKU-Alignment Team. All Rights Reserved.
|
4 |
+
#
|
5 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
6 |
+
# you may not use this file except in compliance with the License.
|
7 |
+
# You may obtain a copy of the License at
|
8 |
+
#
|
9 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
10 |
+
#
|
11 |
+
# Unless required by applicable law or agreed to in writing, software
|
12 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
13 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
14 |
+
# See the License for the specific language governing permissions and
|
15 |
+
# limitations under the License.
|
16 |
+
# ==============================================================================
|
17 |
+
|
18 |
+
if [ -z "${BASH_VERSION}" ]; then
|
19 |
+
echo "Please use bash to run this script." >&2
|
20 |
+
exit 1
|
21 |
+
fi
|
22 |
+
|
23 |
+
set -x
|
24 |
+
|
25 |
+
SCRIPT_DIR="$(cd "$(dirname "$0")" &>/dev/null && pwd)"
|
26 |
+
ROOT_DIR="$(dirname "${SCRIPT_DIR}")"
|
27 |
+
export PYTHONPATH="${ROOT_DIR}${PYTHONPATH:+:${PYTHONPATH}}"
|
28 |
+
export LOGLEVEL="${LOGLEVEL:-WARNING}"
|
29 |
+
|
30 |
+
MODEL_NAME_OR_PATH="models/alpaca-7b-reproduced"
|
31 |
+
OUTPUT_DIR="models/alpaca-7b-sft"
|
32 |
+
unset HOSTFILE
|
33 |
+
ZERO_STAGE=3
|
34 |
+
OFFLOAD="none"
|
35 |
+
while [[ "$#" -gt 0 ]]; do
|
36 |
+
arg="$1"
|
37 |
+
shift
|
38 |
+
case "${arg}" in
|
39 |
+
--model_name_or_path)
|
40 |
+
MODEL_NAME_OR_PATH="$1"
|
41 |
+
shift
|
42 |
+
;;
|
43 |
+
--model_name_or_path=*)
|
44 |
+
MODEL_NAME_OR_PATH="${arg#*=}"
|
45 |
+
;;
|
46 |
+
--output_dir)
|
47 |
+
OUTPUT_DIR="$1"
|
48 |
+
shift
|
49 |
+
;;
|
50 |
+
--output_dir=*)
|
51 |
+
OUTPUT_DIR="${arg#*=}"
|
52 |
+
;;
|
53 |
+
--hostfile)
|
54 |
+
HOSTFILE="$1"
|
55 |
+
shift
|
56 |
+
;;
|
57 |
+
--hostfile=*)
|
58 |
+
HOSTFILE="${arg#*=}"
|
59 |
+
;;
|
60 |
+
--zero_stage)
|
61 |
+
ZERO_STAGE="$1"
|
62 |
+
shift
|
63 |
+
;;
|
64 |
+
--zero_stage=*)
|
65 |
+
ZERO_STAGE="${arg#*=}"
|
66 |
+
;;
|
67 |
+
--offload)
|
68 |
+
OFFLOAD="$1"
|
69 |
+
shift
|
70 |
+
;;
|
71 |
+
--offload=*)
|
72 |
+
OFFLOAD="${arg#*=}"
|
73 |
+
;;
|
74 |
+
*)
|
75 |
+
echo "Unknown parameter passed: '${arg}'" >&2
|
76 |
+
exit 1
|
77 |
+
;;
|
78 |
+
esac
|
79 |
+
done
|
80 |
+
|
81 |
+
mkdir -p "${OUTPUT_DIR}"
|
82 |
+
OUTPUT_DIR="$(cd "${OUTPUT_DIR}" &>/dev/null && pwd)"
|
83 |
+
if [[ ! -f "${OUTPUT_DIR}/.gitignore" ]]; then
|
84 |
+
echo '*' >"${OUTPUT_DIR}/.gitignore"
|
85 |
+
fi
|
86 |
+
|
87 |
+
cp -f "$0" "${OUTPUT_DIR}/script.sh"
|
88 |
+
|
89 |
+
if [[ -z "${WANDB_API_KEY}" ]]; then
|
90 |
+
export WANDB_MODE="offline"
|
91 |
+
fi
|
92 |
+
|
93 |
+
MASTER_PORT_START=10000
|
94 |
+
MASTER_PORT_END=65535
|
95 |
+
MASTER_PORT="$(
|
96 |
+
comm -23 \
|
97 |
+
<(seq "${MASTER_PORT_START}" "${MASTER_PORT_END}" | sort) \
|
98 |
+
<(ss -Htan | awk '{ print $4 }' | awk -F ':' '{ print $NF }' | sort -u) |
|
99 |
+
shuf | head -n 1
|
100 |
+
)"
|
101 |
+
|
102 |
+
DEEPSPEED_ARGS=()
|
103 |
+
if [[ -n "${HOSTFILE+x}" ]]; then
|
104 |
+
DEEPSPEED_ARGS+=("--hostfile" "${HOSTFILE}")
|
105 |
+
fi
|
106 |
+
DEEPSPEED_ARGS+=("--master_port" "${MASTER_PORT}")
|
107 |
+
|
108 |
+
exec 1> >(tee "${OUTPUT_DIR}/stdout.log" >&1) 2> >(tee "${OUTPUT_DIR}/stderr.log" >&2)
|
109 |
+
|
110 |
+
deepspeed "${DEEPSPEED_ARGS[@]}" \
|
111 |
+
--module safe_rlhf.finetune \
|
112 |
+
--train_datasets alpaca \
|
113 |
+
--model_name_or_path "${MODEL_NAME_OR_PATH}" \
|
114 |
+
--max_length 1024 \
|
115 |
+
--trust_remote_code True \
|
116 |
+
--epochs 3 \
|
117 |
+
--per_device_train_batch_size 8 \
|
118 |
+
--per_device_eval_batch_size 8 \
|
119 |
+
--gradient_accumulation_steps 8 \
|
120 |
+
--gradient_checkpointing \
|
121 |
+
--learning_rate 2e-5 \
|
122 |
+
--lr_scheduler_type cosine \
|
123 |
+
--lr_warmup_ratio 0.03 \
|
124 |
+
--weight_decay 0.0 \
|
125 |
+
--seed 42 \
|
126 |
+
--output_dir "${OUTPUT_DIR}" \
|
127 |
+
--log_type wandb \
|
128 |
+
--log_project SFT-alpaca \
|
129 |
+
--zero_stage "${ZERO_STAGE}" \
|
130 |
+
--offload "${OFFLOAD}" \
|
131 |
+
--bf16 True \
|
132 |
+
--tf32 True
|
special_tokens_map.json
ADDED
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"bos_token": {
|
3 |
+
"content": "<s>",
|
4 |
+
"lstrip": false,
|
5 |
+
"normalized": true,
|
6 |
+
"rstrip": false,
|
7 |
+
"single_word": false
|
8 |
+
},
|
9 |
+
"eos_token": {
|
10 |
+
"content": "</s>",
|
11 |
+
"lstrip": false,
|
12 |
+
"normalized": true,
|
13 |
+
"rstrip": false,
|
14 |
+
"single_word": false
|
15 |
+
},
|
16 |
+
"pad_token": {
|
17 |
+
"content": "<pad>",
|
18 |
+
"lstrip": false,
|
19 |
+
"normalized": false,
|
20 |
+
"rstrip": false,
|
21 |
+
"single_word": false
|
22 |
+
},
|
23 |
+
"unk_token": {
|
24 |
+
"content": "<unk>",
|
25 |
+
"lstrip": false,
|
26 |
+
"normalized": true,
|
27 |
+
"rstrip": false,
|
28 |
+
"single_word": false
|
29 |
+
}
|
30 |
+
}
|
stderr.log
ADDED
The diff for this file is too large to render.
See raw diff
|
|
stdout.log
ADDED
@@ -0,0 +1,64 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
[2024-09-04 12:51:56,498] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
2 |
+
[2024-09-04 12:51:58,109] [WARNING] [runner.py:212:fetch_hostfile] Unable to find hostfile, will proceed with training with local resources only.
|
3 |
+
[2024-09-04 12:51:58,109] [INFO] [runner.py:585:main] cmd = /home/juntao/Miniconda3/envs/roo/bin/python -u -m deepspeed.launcher.launch --world_info=eyJsb2NhbGhvc3QiOiBbMCwgMSwgMiwgMywgNCwgNSwgNiwgN119 --master_addr=127.0.0.1 --master_port=15892 --module --enable_each_rank_log=None safe_rlhf.finetune --train_datasets alpaca --model_name_or_path models/alpaca-7b-reproduced --max_length 1024 --trust_remote_code True --epochs 3 --per_device_train_batch_size 8 --per_device_eval_batch_size 8 --gradient_accumulation_steps 8 --gradient_checkpointing --learning_rate 2e-5 --lr_scheduler_type cosine --lr_warmup_ratio 0.03 --weight_decay 0.0 --seed 42 --output_dir /home/juntao/Projects/roo/models/alpaca-7b-sft --log_type wandb --log_project SFT-alpaca --zero_stage 3 --offload none --bf16 True --tf32 True
|
4 |
+
[2024-09-04 12:51:59,771] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
5 |
+
[2024-09-04 12:52:01,856] [INFO] [launch.py:146:main] WORLD INFO DICT: {'localhost': [0, 1, 2, 3, 4, 5, 6, 7]}
|
6 |
+
[2024-09-04 12:52:01,856] [INFO] [launch.py:152:main] nnodes=1, num_local_procs=8, node_rank=0
|
7 |
+
[2024-09-04 12:52:01,856] [INFO] [launch.py:163:main] global_rank_mapping=defaultdict(<class 'list'>, {'localhost': [0, 1, 2, 3, 4, 5, 6, 7]})
|
8 |
+
[2024-09-04 12:52:01,856] [INFO] [launch.py:164:main] dist_world_size=8
|
9 |
+
[2024-09-04 12:52:01,856] [INFO] [launch.py:168:main] Setting CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
|
10 |
+
[2024-09-04 12:52:01,858] [INFO] [launch.py:256:main] process 2131706 spawned with command: ['/home/juntao/Miniconda3/envs/roo/bin/python', '-u', '-m', 'safe_rlhf.finetune', '--local_rank=0', '--train_datasets', 'alpaca', '--model_name_or_path', 'models/alpaca-7b-reproduced', '--max_length', '1024', '--trust_remote_code', 'True', '--epochs', '3', '--per_device_train_batch_size', '8', '--per_device_eval_batch_size', '8', '--gradient_accumulation_steps', '8', '--gradient_checkpointing', '--learning_rate', '2e-5', '--lr_scheduler_type', 'cosine', '--lr_warmup_ratio', '0.03', '--weight_decay', '0.0', '--seed', '42', '--output_dir', '/home/juntao/Projects/roo/models/alpaca-7b-sft', '--log_type', 'wandb', '--log_project', 'SFT-alpaca', '--zero_stage', '3', '--offload', 'none', '--bf16', 'True', '--tf32', 'True']
|
11 |
+
[2024-09-04 12:52:01,859] [INFO] [launch.py:256:main] process 2131707 spawned with command: ['/home/juntao/Miniconda3/envs/roo/bin/python', '-u', '-m', 'safe_rlhf.finetune', '--local_rank=1', '--train_datasets', 'alpaca', '--model_name_or_path', 'models/alpaca-7b-reproduced', '--max_length', '1024', '--trust_remote_code', 'True', '--epochs', '3', '--per_device_train_batch_size', '8', '--per_device_eval_batch_size', '8', '--gradient_accumulation_steps', '8', '--gradient_checkpointing', '--learning_rate', '2e-5', '--lr_scheduler_type', 'cosine', '--lr_warmup_ratio', '0.03', '--weight_decay', '0.0', '--seed', '42', '--output_dir', '/home/juntao/Projects/roo/models/alpaca-7b-sft', '--log_type', 'wandb', '--log_project', 'SFT-alpaca', '--zero_stage', '3', '--offload', 'none', '--bf16', 'True', '--tf32', 'True']
|
12 |
+
[2024-09-04 12:52:01,860] [INFO] [launch.py:256:main] process 2131708 spawned with command: ['/home/juntao/Miniconda3/envs/roo/bin/python', '-u', '-m', 'safe_rlhf.finetune', '--local_rank=2', '--train_datasets', 'alpaca', '--model_name_or_path', 'models/alpaca-7b-reproduced', '--max_length', '1024', '--trust_remote_code', 'True', '--epochs', '3', '--per_device_train_batch_size', '8', '--per_device_eval_batch_size', '8', '--gradient_accumulation_steps', '8', '--gradient_checkpointing', '--learning_rate', '2e-5', '--lr_scheduler_type', 'cosine', '--lr_warmup_ratio', '0.03', '--weight_decay', '0.0', '--seed', '42', '--output_dir', '/home/juntao/Projects/roo/models/alpaca-7b-sft', '--log_type', 'wandb', '--log_project', 'SFT-alpaca', '--zero_stage', '3', '--offload', 'none', '--bf16', 'True', '--tf32', 'True']
|
13 |
+
[2024-09-04 12:52:01,861] [INFO] [launch.py:256:main] process 2131709 spawned with command: ['/home/juntao/Miniconda3/envs/roo/bin/python', '-u', '-m', 'safe_rlhf.finetune', '--local_rank=3', '--train_datasets', 'alpaca', '--model_name_or_path', 'models/alpaca-7b-reproduced', '--max_length', '1024', '--trust_remote_code', 'True', '--epochs', '3', '--per_device_train_batch_size', '8', '--per_device_eval_batch_size', '8', '--gradient_accumulation_steps', '8', '--gradient_checkpointing', '--learning_rate', '2e-5', '--lr_scheduler_type', 'cosine', '--lr_warmup_ratio', '0.03', '--weight_decay', '0.0', '--seed', '42', '--output_dir', '/home/juntao/Projects/roo/models/alpaca-7b-sft', '--log_type', 'wandb', '--log_project', 'SFT-alpaca', '--zero_stage', '3', '--offload', 'none', '--bf16', 'True', '--tf32', 'True']
|
14 |
+
[2024-09-04 12:52:01,862] [INFO] [launch.py:256:main] process 2131710 spawned with command: ['/home/juntao/Miniconda3/envs/roo/bin/python', '-u', '-m', 'safe_rlhf.finetune', '--local_rank=4', '--train_datasets', 'alpaca', '--model_name_or_path', 'models/alpaca-7b-reproduced', '--max_length', '1024', '--trust_remote_code', 'True', '--epochs', '3', '--per_device_train_batch_size', '8', '--per_device_eval_batch_size', '8', '--gradient_accumulation_steps', '8', '--gradient_checkpointing', '--learning_rate', '2e-5', '--lr_scheduler_type', 'cosine', '--lr_warmup_ratio', '0.03', '--weight_decay', '0.0', '--seed', '42', '--output_dir', '/home/juntao/Projects/roo/models/alpaca-7b-sft', '--log_type', 'wandb', '--log_project', 'SFT-alpaca', '--zero_stage', '3', '--offload', 'none', '--bf16', 'True', '--tf32', 'True']
|
15 |
+
[2024-09-04 12:52:01,863] [INFO] [launch.py:256:main] process 2131711 spawned with command: ['/home/juntao/Miniconda3/envs/roo/bin/python', '-u', '-m', 'safe_rlhf.finetune', '--local_rank=5', '--train_datasets', 'alpaca', '--model_name_or_path', 'models/alpaca-7b-reproduced', '--max_length', '1024', '--trust_remote_code', 'True', '--epochs', '3', '--per_device_train_batch_size', '8', '--per_device_eval_batch_size', '8', '--gradient_accumulation_steps', '8', '--gradient_checkpointing', '--learning_rate', '2e-5', '--lr_scheduler_type', 'cosine', '--lr_warmup_ratio', '0.03', '--weight_decay', '0.0', '--seed', '42', '--output_dir', '/home/juntao/Projects/roo/models/alpaca-7b-sft', '--log_type', 'wandb', '--log_project', 'SFT-alpaca', '--zero_stage', '3', '--offload', 'none', '--bf16', 'True', '--tf32', 'True']
|
16 |
+
[2024-09-04 12:52:01,864] [INFO] [launch.py:256:main] process 2131712 spawned with command: ['/home/juntao/Miniconda3/envs/roo/bin/python', '-u', '-m', 'safe_rlhf.finetune', '--local_rank=6', '--train_datasets', 'alpaca', '--model_name_or_path', 'models/alpaca-7b-reproduced', '--max_length', '1024', '--trust_remote_code', 'True', '--epochs', '3', '--per_device_train_batch_size', '8', '--per_device_eval_batch_size', '8', '--gradient_accumulation_steps', '8', '--gradient_checkpointing', '--learning_rate', '2e-5', '--lr_scheduler_type', 'cosine', '--lr_warmup_ratio', '0.03', '--weight_decay', '0.0', '--seed', '42', '--output_dir', '/home/juntao/Projects/roo/models/alpaca-7b-sft', '--log_type', 'wandb', '--log_project', 'SFT-alpaca', '--zero_stage', '3', '--offload', 'none', '--bf16', 'True', '--tf32', 'True']
|
17 |
+
[2024-09-04 12:52:01,865] [INFO] [launch.py:256:main] process 2131713 spawned with command: ['/home/juntao/Miniconda3/envs/roo/bin/python', '-u', '-m', 'safe_rlhf.finetune', '--local_rank=7', '--train_datasets', 'alpaca', '--model_name_or_path', 'models/alpaca-7b-reproduced', '--max_length', '1024', '--trust_remote_code', 'True', '--epochs', '3', '--per_device_train_batch_size', '8', '--per_device_eval_batch_size', '8', '--gradient_accumulation_steps', '8', '--gradient_checkpointing', '--learning_rate', '2e-5', '--lr_scheduler_type', 'cosine', '--lr_warmup_ratio', '0.03', '--weight_decay', '0.0', '--seed', '42', '--output_dir', '/home/juntao/Projects/roo/models/alpaca-7b-sft', '--log_type', 'wandb', '--log_project', 'SFT-alpaca', '--zero_stage', '3', '--offload', 'none', '--bf16', 'True', '--tf32', 'True']
|
18 |
+
[2024-09-04 12:52:04,266] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
19 |
+
[2024-09-04 12:52:05,590] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
20 |
+
[2024-09-04 12:52:05,973] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
21 |
+
[2024-09-04 12:52:06,006] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
22 |
+
[2024-09-04 12:52:06,071] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
23 |
+
[2024-09-04 12:52:06,085] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
24 |
+
[2024-09-04 12:52:06,108] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
25 |
+
[2024-09-04 12:52:06,170] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
26 |
+
[2024-09-04 12:52:07,786] [INFO] [comm.py:652:init_distributed] cdb=None
|
27 |
+
[2024-09-04 12:52:08,898] [INFO] [comm.py:652:init_distributed] cdb=None
|
28 |
+
[2024-09-04 12:52:09,351] [INFO] [comm.py:652:init_distributed] cdb=None
|
29 |
+
[2024-09-04 12:52:09,394] [INFO] [comm.py:652:init_distributed] cdb=None
|
30 |
+
[2024-09-04 12:52:09,425] [INFO] [comm.py:652:init_distributed] cdb=None
|
31 |
+
[2024-09-04 12:52:09,488] [INFO] [comm.py:652:init_distributed] cdb=None
|
32 |
+
[2024-09-04 12:52:09,525] [INFO] [comm.py:652:init_distributed] cdb=None
|
33 |
+
[2024-09-04 12:52:09,589] [INFO] [comm.py:652:init_distributed] cdb=None
|
34 |
+
[2024-09-04 12:52:09,589] [INFO] [comm.py:683:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl
|
35 |
+
Set logger level to WARNING.
|
36 |
+
ninja: no work to do.
|
37 |
+
Time to load fused_adam op: 0.13483238220214844 seconds
|
38 |
+
Time to load fused_adam op: 0.20363140106201172 seconds
|
39 |
+
Time to load fused_adam op: 0.2036271095275879 secondsTime to load fused_adam op: 0.20359563827514648 seconds
|
40 |
+
|
41 |
+
Time to load fused_adam op: 0.20377779006958008 seconds
|
42 |
+
Time to load fused_adam op: 0.2039332389831543 seconds
|
43 |
+
Time to load fused_adam op: 0.20380067825317383 seconds
|
44 |
+
Time to load fused_adam op: 0.20460724830627441 seconds
|
45 |
+
Parameter Offload: Total persistent parameters: 266240 in 65 params
|
46 |
+
***** Running training *****
|
47 |
+
Saving model to "/home/juntao/Projects/roo/models/alpaca-7b-sft" ...
|
48 |
+
Saving DeepSpeed Checkpoints...
|
49 |
+
Converting DeepSpeed Checkpoints to Hugging Face format...
|
50 |
+
[2024-09-04 13:41:40,053] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
51 |
+
Processing zero checkpoint './global_step304'
|
52 |
+
Detected checkpoint of type zero stage ZeroStageEnum.weights, world_size: 8
|
53 |
+
Parsing checkpoint created by deepspeed==0.15.0
|
54 |
+
Reconstructed Trainable fp32 state dict with 291 params 6738423808 elements
|
55 |
+
Saving fp32 state dict to pytorch_model.bin
|
56 |
+
Model saved!
|
57 |
+
[2024-09-04 13:42:46,338] [INFO] [launch.py:351:main] Process 2131710 exits successfully.
|
58 |
+
[2024-09-04 13:42:46,338] [INFO] [launch.py:351:main] Process 2131712 exits successfully.
|
59 |
+
[2024-09-04 13:42:46,338] [INFO] [launch.py:351:main] Process 2131707 exits successfully.
|
60 |
+
[2024-09-04 13:42:46,338] [INFO] [launch.py:351:main] Process 2131713 exits successfully.
|
61 |
+
[2024-09-04 13:42:46,339] [INFO] [launch.py:351:main] Process 2131709 exits successfully.
|
62 |
+
[2024-09-04 13:42:46,339] [INFO] [launch.py:351:main] Process 2131711 exits successfully.
|
63 |
+
[2024-09-04 13:42:47,339] [INFO] [launch.py:351:main] Process 2131708 exits successfully.
|
64 |
+
[2024-09-04 13:42:56,341] [INFO] [launch.py:351:main] Process 2131706 exits successfully.
|
tokenizer.json
ADDED
The diff for this file is too large to render.
See raw diff
|
|
tokenizer_config.json
ADDED
@@ -0,0 +1,50 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"add_bos_token": true,
|
3 |
+
"add_eos_token": false,
|
4 |
+
"added_tokens_decoder": {
|
5 |
+
"0": {
|
6 |
+
"content": "<unk>",
|
7 |
+
"lstrip": false,
|
8 |
+
"normalized": true,
|
9 |
+
"rstrip": false,
|
10 |
+
"single_word": false,
|
11 |
+
"special": true
|
12 |
+
},
|
13 |
+
"1": {
|
14 |
+
"content": "<s>",
|
15 |
+
"lstrip": false,
|
16 |
+
"normalized": true,
|
17 |
+
"rstrip": false,
|
18 |
+
"single_word": false,
|
19 |
+
"special": true
|
20 |
+
},
|
21 |
+
"2": {
|
22 |
+
"content": "</s>",
|
23 |
+
"lstrip": false,
|
24 |
+
"normalized": true,
|
25 |
+
"rstrip": false,
|
26 |
+
"single_word": false,
|
27 |
+
"special": true
|
28 |
+
},
|
29 |
+
"32000": {
|
30 |
+
"content": "<pad>",
|
31 |
+
"lstrip": false,
|
32 |
+
"normalized": false,
|
33 |
+
"rstrip": false,
|
34 |
+
"single_word": false,
|
35 |
+
"special": true
|
36 |
+
}
|
37 |
+
},
|
38 |
+
"bos_token": "<s>",
|
39 |
+
"clean_up_tokenization_spaces": false,
|
40 |
+
"eos_token": "</s>",
|
41 |
+
"legacy": true,
|
42 |
+
"model_max_length": 1024,
|
43 |
+
"pad_token": "<pad>",
|
44 |
+
"padding_side": "right",
|
45 |
+
"sp_model_kwargs": {},
|
46 |
+
"spaces_between_special_tokens": false,
|
47 |
+
"tokenizer_class": "LlamaTokenizer",
|
48 |
+
"unk_token": "<unk>",
|
49 |
+
"use_default_system_prompt": false
|
50 |
+
}
|
zero_to_fp32.py
ADDED
@@ -0,0 +1,604 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/env python
|
2 |
+
|
3 |
+
# Copyright (c) Microsoft Corporation.
|
4 |
+
# SPDX-License-Identifier: Apache-2.0
|
5 |
+
|
6 |
+
# DeepSpeed Team
|
7 |
+
|
8 |
+
# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets
|
9 |
+
# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in
|
10 |
+
# the future. Once extracted, the weights don't require DeepSpeed and can be used in any
|
11 |
+
# application.
|
12 |
+
#
|
13 |
+
# example: python zero_to_fp32.py . pytorch_model.bin
|
14 |
+
|
15 |
+
import argparse
|
16 |
+
import torch
|
17 |
+
import glob
|
18 |
+
import math
|
19 |
+
import os
|
20 |
+
import re
|
21 |
+
from collections import OrderedDict
|
22 |
+
from dataclasses import dataclass
|
23 |
+
|
24 |
+
# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with
|
25 |
+
# DeepSpeed data structures it has to be available in the current python environment.
|
26 |
+
from deepspeed.utils import logger
|
27 |
+
from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS,
|
28 |
+
FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES,
|
29 |
+
FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS)
|
30 |
+
|
31 |
+
|
32 |
+
@dataclass
class zero_model_state:
    """Per-rank model-state record used to reconstruct a consolidated fp32 state_dict.

    The lower_snake_case class name is kept because the rest of the script
    constructs instances by this exact name.
    """
    buffers: dict            # buffer name -> fp32 tensor
    param_shapes: dict       # per param-group dict: param name -> shape
    shared_params: list      # [alias_name, source_name] pairs
    # Version string recorded by deepspeed (e.g. "0.15.0"); the original
    # annotation said `int`, but the checkpoint stores a string.
    ds_version: str
    frozen_param_shapes: dict        # frozen param name -> shape, or None
    frozen_param_fragments: dict     # frozen param name -> fragment tensor, or None
|
40 |
+
|
41 |
+
|
42 |
+
# Global verbosity flag: falsy disables the diagnostic prints throughout this
# script; the CLI entry point at the bottom overwrites it from --debug.
debug = 0

# load to cpu
# All checkpoint tensors are materialized on CPU so conversion needs no GPU.
device = torch.device('cpu')
|
46 |
+
|
47 |
+
|
48 |
+
def atoi(text):
    """Return ``int(text)`` when *text* is all digits; otherwise return *text* unchanged."""
    if text.isdigit():
        return int(text)
    return text
|
50 |
+
|
51 |
+
|
52 |
+
def natural_keys(text):
    '''
    alist.sort(key=natural_keys) sorts in human order
    http://nedbatchelder.com/blog/200712/human_sorting.html
    (See Toothy's implementation in the comments)
    '''
    # Split on digit runs (the capturing group keeps them), converting each
    # numeric chunk to int so "step10" sorts after "step2".
    return [int(chunk) if chunk.isdigit() else chunk for chunk in re.split(r'(\d+)', text)]
|
59 |
+
|
60 |
+
|
61 |
+
def get_model_state_file(checkpoint_dir, zero_stage):
    """Return the path of the single rank-0 model-states file for the given ZeRO stage.

    Args:
        checkpoint_dir: directory holding the deepspeed checkpoint files
        zero_stage: ZeRO optimization stage recorded in the checkpoint (1, 2 or 3)

    Raises:
        FileNotFoundError: if the directory or the expected file doesn't exist.
        ValueError: if ``zero_stage`` is not a recognized stage.
    """
    if not os.path.isdir(checkpoint_dir):
        raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist")

    # there should be only one file
    if zero_stage <= 2:
        file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt")
    elif zero_stage == 3:
        file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt")
    else:
        # Previously an unexpected stage fell through with `file` unbound and
        # crashed below with UnboundLocalError; fail with a clear message instead.
        raise ValueError(f"unknown zero stage {zero_stage}")

    if not os.path.exists(file):
        raise FileNotFoundError(f"can't find model states file at '{file}'")

    return file
|
75 |
+
|
76 |
+
|
77 |
+
def get_checkpoint_files(checkpoint_dir, glob_pattern):
    """Return all files in *checkpoint_dir* matching *glob_pattern*, naturally sorted.

    Raises FileNotFoundError when no file matches.
    """
    # XXX: need to test that this simple glob rule works for multi-node setup too
    pattern = os.path.join(checkpoint_dir, glob_pattern)
    ckpt_files = sorted(glob.glob(pattern), key=natural_keys)

    if not ckpt_files:
        raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'")

    return ckpt_files
|
85 |
+
|
86 |
+
|
87 |
+
def get_optim_files(checkpoint_dir):
    """Return the naturally-sorted ``*_optim_states.pt`` files under *checkpoint_dir*."""
    return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt")
|
89 |
+
|
90 |
+
|
91 |
+
def get_model_state_files(checkpoint_dir):
    """Return the naturally-sorted ``*_model_states.pt`` files under *checkpoint_dir*."""
    return get_checkpoint_files(checkpoint_dir, "*_model_states.pt")
|
93 |
+
|
94 |
+
|
95 |
+
def parse_model_states(files):
    """Load each ``*_model_states.pt`` file and extract what reconstruction needs.

    Returns a list of ``zero_model_state`` records (one per file/rank) holding
    the fp32 buffers, parameter shapes, shared-parameter aliases, the deepspeed
    version string and any frozen-parameter shapes/fragments.

    Raises:
        ValueError: if a file lacks the buffer-names key and is therefore not a
            model-state checkpoint.
    """
    zero_model_states = []
    for file in files:
        state_dict = torch.load(file, map_location=device)

        if BUFFER_NAMES not in state_dict:
            raise ValueError(f"{file} is not a model state checkpoint")
        buffer_names = state_dict[BUFFER_NAMES]
        if debug:
            print("Found buffers:", buffer_names)

        # recover just the buffers while restoring them to fp32 if they were saved in fp16
        buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names}
        param_shapes = state_dict[PARAM_SHAPES]

        # collect parameters that are included in param_shapes
        # NOTE(review): param_names is assembled here but never used afterwards.
        param_names = []
        for s in param_shapes:
            for name in s.keys():
                param_names.append(name)

        # update with frozen parameters
        frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None)
        if frozen_param_shapes is not None:
            if debug:
                print(f"Found frozen_param_shapes: {frozen_param_shapes}")
            param_names += list(frozen_param_shapes.keys())

        # handle shared params
        shared_params = [[k, v] for k, v in state_dict["shared_params"].items()]

        ds_version = state_dict.get(DS_VERSION, None)

        frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None)

        z_model_state = zero_model_state(buffers=buffers,
                                         param_shapes=param_shapes,
                                         shared_params=shared_params,
                                         ds_version=ds_version,
                                         frozen_param_shapes=frozen_param_shapes,
                                         frozen_param_fragments=frozen_param_fragments)
        zero_model_states.append(z_model_state)

    return zero_model_states
|
139 |
+
|
140 |
+
|
141 |
+
def parse_optim_states(files, ds_checkpoint_dir):
    """Load all per-rank optimizer-state files and extract the fp32 flat groups.

    Args:
        files: list of ``*_optim_states.pt`` paths, one per rank
        ds_checkpoint_dir: checkpoint directory (used only in error messages)

    Returns:
        Tuple ``(zero_stage, world_size, fp32_flat_groups)`` where
        ``fp32_flat_groups[rank]`` is the rank's flattened fp32 master weights
        (a list of tensors for stage<=2, a single concatenated tensor for stage 3).

    Raises:
        ValueError: if the files are not a zero checkpoint, the file count does
            not match the recorded dp world size, or the stage is unknown.
    """
    total_files = len(files)
    state_dicts = []
    for f in files:
        state_dict = torch.load(f, map_location=device)
        # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights
        # and also handle the case where it was already removed by another helper script
        state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None)
        state_dicts.append(state_dict)

    # idiomatic `not in` (was `if not ZERO_STAGE in ...`)
    if ZERO_STAGE not in state_dicts[0][OPTIMIZER_STATE_DICT]:
        raise ValueError(f"{files[0]} is not a zero checkpoint")
    zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE]
    world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT]

    # For ZeRO-2 each param group can have different partition_count as data parallelism for expert
    # parameters can be different from data parallelism for non-expert parameters. So we can just
    # use the max of the partition_count to get the dp world_size.
    if isinstance(world_size, list):
        world_size = max(world_size)

    if world_size != total_files:
        raise ValueError(
            f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. "
            "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes."
        )

    # the groups are named differently in each stage
    if zero_stage <= 2:
        fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS
    elif zero_stage == 3:
        fp32_groups_key = FP32_FLAT_GROUPS
    else:
        raise ValueError(f"unknown zero stage {zero_stage}")

    if zero_stage <= 2:
        fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))]
    elif zero_stage == 3:
        # if there is more than one param group, there will be multiple flattened tensors - one
        # flattened tensor per group - for simplicity merge them into a single tensor
        #
        # XXX: could make the script more memory efficient for when there are multiple groups - it
        # will require matching the sub-lists of param_shapes for each param group flattened tensor
        fp32_flat_groups = [
            torch.cat(state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key], 0) for i in range(len(state_dicts))
        ]

    return zero_stage, world_size, fp32_flat_groups
|
192 |
+
|
193 |
+
|
194 |
+
def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters):
    """
    Returns fp32 state_dict reconstructed from ds checkpoint

    Args:
        - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are)
        - ``exclude_frozen_parameters``: if True, frozen parameters are omitted from the result

    """
    print(f"Processing zero checkpoint '{ds_checkpoint_dir}'")

    # The zero stage and dp world size are recorded in the optimizer states.
    optim_files = get_optim_files(ds_checkpoint_dir)
    zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir)
    print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}")

    model_files = get_model_state_files(ds_checkpoint_dir)

    zero_model_states = parse_model_states(model_files)
    print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}')

    # Dispatch on stage; stage 1 and 2 share one reconstruction path.
    if zero_stage <= 2:
        return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states,
                                                          exclude_frozen_parameters)
    elif zero_stage == 3:
        return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states,
                                                          exclude_frozen_parameters)
    # NOTE(review): any other stage falls off the end and returns None implicitly;
    # parse_optim_states already raises for unknown stages, so this is unreachable in practice.
|
219 |
+
|
220 |
+
|
221 |
+
def _zero2_merge_frozen_params(state_dict, zero_model_states):
    """Copy frozen (non-trainable) parameters into *state_dict* for a ZeRO-1/2 checkpoint.

    Under stage<=2 frozen params are not partitioned, so rank 0's fragments are
    taken as-is. No-op when the checkpoint has no frozen params.
    """
    if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0:
        return

    frozen_param_shapes = zero_model_states[0].frozen_param_shapes
    frozen_param_fragments = zero_model_states[0].frozen_param_fragments

    if debug:
        num_elem = sum(s.numel() for s in frozen_param_shapes.values())
        print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}')

    wanted_params = len(frozen_param_shapes)
    wanted_numel = sum(s.numel() for s in frozen_param_shapes.values())
    avail_numel = sum([p.numel() for p in frozen_param_fragments.values()])
    print(f'Frozen params: Have {avail_numel} numels to process.')
    print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')

    # Running totals reported at the end for sanity/inspection.
    total_params = 0
    total_numel = 0
    for name, shape in frozen_param_shapes.items():
        total_params += 1
        unpartitioned_numel = shape.numel()
        total_numel += unpartitioned_numel

        # Fragment already holds the full (unpartitioned) tensor for stage<=2.
        state_dict[name] = frozen_param_fragments[name]

        if debug:
            print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")

    print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")
|
251 |
+
|
252 |
+
|
253 |
+
def _has_callable(obj, fn):
|
254 |
+
attr = getattr(obj, fn, None)
|
255 |
+
return callable(attr)
|
256 |
+
|
257 |
+
|
258 |
+
def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
    """Reconstruct trainable fp32 params from ZeRO-1/2 partitions into *state_dict*.

    Each rank holds a contiguous slice of every param group's flattened fp32
    master weights; the slices are concatenated per group and then carved back
    into individual parameters by walking the recorded shapes in order.
    """
    param_shapes = zero_model_states[0].param_shapes

    # Reconstruction protocol:
    #
    # XXX: document this

    if debug:
        for i in range(world_size):
            for j in range(len(fp32_flat_groups[0])):
                print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}")

    # XXX: memory usage doubles here (zero2)
    num_param_groups = len(fp32_flat_groups[0])
    merged_single_partition_of_fp32_groups = []
    for i in range(num_param_groups):
        # Concatenate every rank's slice of group i into one full flat vector.
        merged_partitions = [sd[i] for sd in fp32_flat_groups]
        full_single_fp32_vector = torch.cat(merged_partitions, 0)
        merged_single_partition_of_fp32_groups.append(full_single_fp32_vector)
    avail_numel = sum(
        [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups])

    if debug:
        wanted_params = sum([len(shapes) for shapes in param_shapes])
        wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes])
        # not asserting if there is a mismatch due to possible padding
        print(f"Have {avail_numel} numels to process.")
        print(f"Need {wanted_numel} numels in {wanted_params} params.")

    # params
    # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
    # out-of-core computing solution
    total_numel = 0
    total_params = 0
    for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups):
        offset = 0
        avail_numel = full_single_fp32_vector.numel()
        for name, shape in shapes.items():

            # Shapes may be torch.Size (has .numel) or plain tuples (use math.prod).
            unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape)
            total_numel += unpartitioned_numel
            total_params += 1

            if debug:
                print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")
            # Slice this param's elements out of the flat vector and reshape.
            state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape)
            offset += unpartitioned_numel

        # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and
        # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex
        # paddings performed in the code it's almost impossible to predict the exact numbers w/o the
        # live optimizer object, so we are checking that the numbers are within the right range
        align_to = 2 * world_size

        def zero2_align(x):
            return align_to * math.ceil(x / align_to)

        if debug:
            print(f"original offset={offset}, avail_numel={avail_numel}")

        offset = zero2_align(offset)
        avail_numel = zero2_align(avail_numel)

        if debug:
            print(f"aligned offset={offset}, avail_numel={avail_numel}")

        # Sanity check
        if offset != avail_numel:
            raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")

    print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements")
|
329 |
+
|
330 |
+
|
331 |
+
def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states,
                                               exclude_frozen_parameters):
    """Assemble the consolidated fp32 state_dict for a ZeRO-1/2 checkpoint."""
    state_dict = OrderedDict()
    rank0 = zero_model_states[0]

    # Seed the result with rank 0's fp32 buffers.
    buffers = rank0.buffers
    state_dict.update(buffers)
    if debug:
        print(f"added {len(buffers)} buffers")

    if not exclude_frozen_parameters:
        _zero2_merge_frozen_params(state_dict, zero_model_states)

    _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states)

    # Re-establish shared-parameter aliases: each alias points at its source tensor.
    for alias_name, source_name in rank0.shared_params:
        if source_name in state_dict:
            state_dict[alias_name] = state_dict[source_name]

    return state_dict
|
352 |
+
|
353 |
+
|
354 |
+
def zero3_partitioned_param_info(unpartitioned_numel, world_size):
    """Return ``(per_rank_numel, padding_numel)`` for a ZeRO-3 partitioned param.

    Each rank stores ``ceil(n / world_size)`` elements; the last shard is padded
    by ``padding_numel`` zeros so all shards are equal-sized.
    """
    partitioned_numel = math.ceil(unpartitioned_numel / world_size)
    # -n % w == (w - n % w) % w: zero when n divides evenly, else the shortfall.
    padding_numel = -unpartitioned_numel % world_size
    return partitioned_numel, padding_numel
|
359 |
+
|
360 |
+
|
361 |
+
def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states):
    """Reconstruct frozen (non-trainable) params from ZeRO-3 fragments into *state_dict*.

    Under stage 3 frozen params are partitioned across ranks, so each param is
    rebuilt by concatenating every rank's fragment and trimming the padding.
    No-op when the checkpoint has no frozen params.
    """
    if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0:
        return

    if debug:
        for i in range(world_size):
            num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values())
            print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}')

    frozen_param_shapes = zero_model_states[0].frozen_param_shapes
    wanted_params = len(frozen_param_shapes)
    wanted_numel = sum(s.numel() for s in frozen_param_shapes.values())
    # Every rank holds an equal-size fragment, hence "* world_size".
    avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size
    print(f'Frozen params: Have {avail_numel} numels to process.')
    print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')

    total_params = 0
    total_numel = 0
    for name, shape in zero_model_states[0].frozen_param_shapes.items():
        total_params += 1
        unpartitioned_numel = shape.numel()
        total_numel += unpartitioned_numel

        # Concatenate all ranks' fragments, drop trailing padding, restore shape.
        param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states)
        state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape)

        partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)

        if debug:
            print(
                f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
            )

    print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")
|
395 |
+
|
396 |
+
|
397 |
+
def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
    """Reconstruct trainable fp32 params from ZeRO-3 flat shards into *state_dict*.

    Under stage 3 every rank holds a flat shard covering a slice of each param;
    each param is rebuilt by concatenating the per-rank slices at a running
    offset, trimming the padding, and restoring the original shape.
    """
    param_shapes = zero_model_states[0].param_shapes

    # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each
    # param, re-consolidating each param, while dealing with padding if any

    # merge list of dicts, preserving order
    param_shapes = {k: v for d in param_shapes for k, v in d.items()}

    if debug:
        for i in range(world_size):
            print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}")

    wanted_params = len(param_shapes)
    wanted_numel = sum(shape.numel() for shape in param_shapes.values())
    # not asserting if there is a mismatch due to possible padding
    # (the original also computed this identical value earlier and immediately
    # shadowed it here — the dead first assignment has been removed)
    avail_numel = fp32_flat_groups[0].numel() * world_size
    print(f"Trainable params: Have {avail_numel} numels to process.")
    print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.")

    # params
    # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
    # out-of-core computing solution
    offset = 0
    total_numel = 0
    total_params = 0
    for name, shape in param_shapes.items():

        unpartitioned_numel = shape.numel()
        total_numel += unpartitioned_numel
        total_params += 1

        partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)

        if debug:
            print(
                f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
            )

        # XXX: memory usage doubles here
        state_dict[name] = torch.cat(
            tuple(fp32_flat_groups[i].narrow(0, offset, partitioned_numel) for i in range(world_size)),
            0).narrow(0, 0, unpartitioned_numel).view(shape)
        offset += partitioned_numel

    # offset tracked a single rank's shard; scale up to compare with the total.
    offset *= world_size

    # Sanity check
    if offset != avail_numel:
        raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")

    print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements")
|
449 |
+
|
450 |
+
|
451 |
+
def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states,
                                               exclude_frozen_parameters):
    """Assemble the consolidated fp32 state_dict for a ZeRO-3 checkpoint."""
    state_dict = OrderedDict()
    rank0 = zero_model_states[0]

    # Seed the result with rank 0's fp32 buffers.
    buffers = rank0.buffers
    state_dict.update(buffers)
    if debug:
        print(f"added {len(buffers)} buffers")

    if not exclude_frozen_parameters:
        _zero3_merge_frozen_params(state_dict, world_size, zero_model_states)

    _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states)

    # Re-establish shared-parameter aliases: each alias points at its source tensor.
    for alias_name, source_name in rank0.shared_params:
        if source_name in state_dict:
            state_dict[alias_name] = state_dict[source_name]

    return state_dict
|
472 |
+
|
473 |
+
|
474 |
+
def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag=None, exclude_frozen_parameters=False):
    """
    Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with
    ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example
    via a model hub.

    Args:
        - ``checkpoint_dir``: path to the desired checkpoint folder
        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14``
        - ``exclude_frozen_parameters``: exclude frozen parameters

    Returns:
        - pytorch ``state_dict``

    Note: this approach may not work if your application doesn't have sufficient free CPU memory and
    you may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with
    the checkpoint.

    A typical usage might be ::

        from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint
        # do the training and checkpoint saving
        state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu
        model = model.cpu() # move to cpu
        model.load_state_dict(state_dict)
        # submit to model hub or save the model to share with others

    In this example the ``model`` will no longer be usable in the deepspeed context of the same
    application. i.e. you will need to re-initialize the deepspeed engine, since
    ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.

    If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead.

    """
    # Resolve the checkpoint tag: deepspeed writes the most recent tag name
    # into a file literally called "latest" in the checkpoint dir.
    if tag is None:
        latest_path = os.path.join(checkpoint_dir, 'latest')
        if os.path.isfile(latest_path):
            with open(latest_path, 'r') as fd:
                tag = fd.read().strip()
        else:
            raise ValueError(f"Unable to find 'latest' file at {latest_path}")

    # The actual shard files live in the tag subdirectory, e.g. global_step304/.
    ds_checkpoint_dir = os.path.join(checkpoint_dir, tag)

    if not os.path.isdir(ds_checkpoint_dir):
        raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist")

    return _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters)
|
522 |
+
|
523 |
+
|
524 |
+
def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, output_file, tag=None, exclude_frozen_parameters=False):
    """
    Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be
    loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed.

    Args:
        - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
        - ``output_file``: path to the pytorch fp32 state_dict output file (e.g. path/pytorch_model.bin)
        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
        - ``exclude_frozen_parameters``: exclude frozen parameters
    """
    # Reconstruct fully in CPU memory, then serialize in one shot.
    state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag, exclude_frozen_parameters)
    print(f"Saving fp32 state dict to {output_file}")
    torch.save(state_dict, output_file)
|
539 |
+
|
540 |
+
|
541 |
+
def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None):
    """
    1. Put the provided model to cpu
    2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict``
    3. Load it into the provided model

    Args:
        - ``model``: the model object to update
        - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``

    Returns:
        - ``model``: modified model

    Make sure you have plenty of CPU memory available before you call this function. If you don't
    have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it
    conveniently placed for you in the checkpoint folder.

    A typical usage might be ::

        from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint
        model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir)
        # submit to model hub or save the model to share with others

    Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context
    of the same application. i.e. you will need to re-initialize the deepspeed engine, since
    ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.

    """
    # (was f-strings with no placeholders — plain strings do the same job)
    logger.info("Extracting fp32 weights")
    state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag)

    logger.info("Overwriting model with fp32 weights")
    model = model.cpu()
    # strict=False: buffers/frozen params may legitimately be absent from the
    # reconstructed dict.
    model.load_state_dict(state_dict, strict=False)

    return model
|
578 |
+
|
579 |
+
|
580 |
+
# CLI entry point: convert a deepspeed checkpoint directory into a single
# consolidated fp32 pytorch_model.bin, e.g.
#   python zero_to_fp32.py . pytorch_model.bin
if __name__ == "__main__":

    parser = argparse.ArgumentParser()
    parser.add_argument("checkpoint_dir",
                        type=str,
                        help="path to the desired checkpoint folder, e.g., path/checkpoint-12")
    parser.add_argument(
        "output_file",
        type=str,
        help="path to the pytorch fp32 state_dict output file (e.g. path/checkpoint-12/pytorch_model.bin)")
    parser.add_argument("-t",
                        "--tag",
                        type=str,
                        default=None,
                        help="checkpoint tag used as a unique identifier for checkpoint. e.g., global_step1")
    parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters")
    parser.add_argument("-d", "--debug", action='store_true', help="enable debug")
    args = parser.parse_args()

    # Overwrite the module-level flag so the helper functions print verbosely.
    debug = args.debug

    convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir,
                                               args.output_file,
                                               tag=args.tag,
                                               exclude_frozen_parameters=args.exclude_frozen_parameters)
|