---
name: Self-hosted runner (nightly-ci)

# Note that each job's dependencies go into a corresponding docker file.
#
# For example, `run_all_tests_torch_cuda_extensions_gpu` runs in the docker image
# `huggingface/transformers-pytorch-deepspeed-nightly-gpu`, whose Dockerfile is
# presumably `docker/transformers-pytorch-deepspeed-nightly-gpu/Dockerfile`
# (NOTE(review): path inferred from the image name; confirm it exists).
# Triggers: an external `repository_dispatch` event, or a call from another
# workflow through `workflow_call`. Neither trigger takes any configuration
# here, so both keys are intentionally left without a value.
on:
  repository_dispatch:
  workflow_call:
# Environment variables exported to every job of this workflow.
# Values that look like booleans or numbers are quoted so that they stay
# strings regardless of which YAML loader reads the file.
env:
  # Matches the in-container mount target of the `-v ...:/mnt/cache/` option
  # used by the job containers.
  HF_HOME: /mnt/cache
  TRANSFORMERS_IS_CI: 'yes'
  OMP_NUM_THREADS: '8'
  MKL_NUM_THREADS: '8'
  RUN_SLOW: 'yes'
  SIGOPT_API_TOKEN: ${{ secrets.SIGOPT_API_TOKEN }}
  TF_FORCE_GPU_ALLOW_GROWTH: 'true'
  RUN_PT_TF_CROSS_TESTS: '1'
jobs:
  # Gate job: runs on a GitHub-hosted machine and calls
  # `utils/check_self_hosted_runner.py` for the two self-hosted target runners.
  # All other jobs depend on it, directly or transitively.
  check_runner_status:
    name: Check Runner Status
    runs-on: ubuntu-latest
    steps:
      - name: Checkout transformers
        uses: actions/checkout@v3
        with:
          fetch-depth: 2

      - name: Check Runner Status
        # The token is passed through the environment so that the secret is
        # not spliced into the script text before the shell parses it.
        env:
          ACCESS_REPO_INFO_TOKEN: ${{ secrets.ACCESS_REPO_INFO_TOKEN }}
        run: python utils/check_self_hosted_runner.py --target_runners single-gpu-past-ci-runner-docker,multi-gpu-past-ci-runner-docker --token "$ACCESS_REPO_INFO_TOKEN"

  # Starts the test container once per machine type and runs `nvidia-smi` in it.
  check_runners:
    name: Check Runners
    needs: check_runner_status
    strategy:
      matrix:
        machine_type: [single-gpu, multi-gpu]
    # Resolves to `single-gpu-docker-past-ci` / `multi-gpu-docker-past-ci`.
    runs-on: ${{ format('{0}-{1}', matrix.machine_type, 'docker-past-ci') }}
    container:
      image: huggingface/transformers-all-latest-torch-nightly-gpu
      options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
    steps:
      - name: NVIDIA-SMI
        run: |
          nvidia-smi

  # Updates the clone shipped in the image to the triggering commit and
  # publishes the list of test folders as the `matrix` output consumed by the
  # model test jobs.
  setup:
    name: Setup
    needs: check_runners
    strategy:
      matrix:
        machine_type: [single-gpu, multi-gpu]
    runs-on: ${{ format('{0}-{1}', matrix.machine_type, 'docker-past-ci') }}
    container:
      image: huggingface/transformers-all-latest-torch-nightly-gpu
      options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
    outputs:
      matrix: ${{ steps.set-matrix.outputs.matrix }}
    steps:
      - name: Update clone
        working-directory: /transformers
        run: |
          git fetch && git checkout ${{ github.sha }}

      - name: Cleanup
        working-directory: /transformers
        run: |
          rm -rf tests/__pycache__
          rm -rf tests/models/__pycache__
          rm -rf reports

      - name: Show installed libraries and their versions
        working-directory: /transformers
        run: pip freeze

      - id: set-matrix
        name: Identify models to test
        working-directory: /transformers/tests
        # Prints a Python list: every directory `models/<x>` (sorted) followed by
        # every other directory directly under `tests` (sorted, `models` removed).
        run: |
          echo "matrix=$(python3 -c 'import os; tests = os.getcwd(); model_tests = os.listdir(os.path.join(tests, "models")); d1 = sorted(list(filter(os.path.isdir, os.listdir(tests)))); d2 = sorted(list(filter(os.path.isdir, [f"models/{x}" for x in model_tests]))); d1.remove("models"); d = d2 + d1; print(d)')" >> $GITHUB_OUTPUT

      - name: NVIDIA-SMI
        run: |
          nvidia-smi

  # Runs `tests/<folder>` for every folder reported by `setup`, on the
  # single-gpu runner, and uploads the report directory as an artifact.
  run_tests_single_gpu:
    name: Model tests
    strategy:
      # A failing folder must not cancel the remaining folders.
      fail-fast: false
      matrix:
        folders: ${{ fromJson(needs.setup.outputs.matrix) }}
        machine_type: [single-gpu]
    runs-on: ${{ format('{0}-{1}', matrix.machine_type, 'docker-past-ci') }}
    container:
      image: huggingface/transformers-all-latest-torch-nightly-gpu
      options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
    needs: setup
    steps:
      - name: Echo folder ${{ matrix.folders }}
        shell: bash
        # For folders like `models/bert`, set an env. var. (`matrix_folders`) to `models_bert`, which will be used to
        # set the artifact folder names (because the character `/` is not allowed).
        run: |
          echo "${{ matrix.folders }}"
          matrix_folders=${{ matrix.folders }}
          matrix_folders=${matrix_folders/'models/'/'models_'}
          echo "$matrix_folders"
          echo "matrix_folders=$matrix_folders" >> $GITHUB_ENV

      - name: Update clone
        working-directory: /transformers
        run: git fetch && git checkout ${{ github.sha }}

      - name: Reinstall transformers in edit mode (remove the one installed during docker image build)
        working-directory: /transformers
        run: python3 -m pip uninstall -y transformers && python3 -m pip install -e .

      - name: NVIDIA-SMI
        run: |
          nvidia-smi

      - name: Environment
        working-directory: /transformers
        run: |
          python3 utils/print_env.py

      - name: Show installed libraries and their versions
        working-directory: /transformers
        run: pip freeze

      - name: Run all tests on GPU
        working-directory: /transformers
        run: python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }} tests/${{ matrix.folders }}

      - name: Failure short reports
        if: ${{ failure() }}
        continue-on-error: true
        run: cat /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }}/failures_short.txt

      - name: Test suite reports artifacts
        if: ${{ always() }}
        uses: actions/upload-artifact@v3
        with:
          # The artifact name uses the `/`-free `matrix_folders`; the report path
          # on disk still uses the raw folder name.
          name: ${{ matrix.machine_type }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports_postfix_nightly
          path: /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }}

  # Same steps as `run_tests_single_gpu`, but on the multi-gpu runner with
  # all GPUs exposed to the container (`--gpus all`).
  run_tests_multi_gpu:
    name: Model tests
    strategy:
      # A failing folder must not cancel the remaining folders.
      fail-fast: false
      matrix:
        folders: ${{ fromJson(needs.setup.outputs.matrix) }}
        machine_type: [multi-gpu]
    runs-on: ${{ format('{0}-{1}', matrix.machine_type, 'docker-past-ci') }}
    container:
      image: huggingface/transformers-all-latest-torch-nightly-gpu
      options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
    needs: setup
    steps:
      - name: Echo folder ${{ matrix.folders }}
        shell: bash
        # For folders like `models/bert`, set an env. var. (`matrix_folders`) to `models_bert`, which will be used to
        # set the artifact folder names (because the character `/` is not allowed).
        run: |
          echo "${{ matrix.folders }}"
          matrix_folders=${{ matrix.folders }}
          matrix_folders=${matrix_folders/'models/'/'models_'}
          echo "$matrix_folders"
          echo "matrix_folders=$matrix_folders" >> $GITHUB_ENV

      - name: Update clone
        working-directory: /transformers
        run: git fetch && git checkout ${{ github.sha }}

      - name: Reinstall transformers in edit mode (remove the one installed during docker image build)
        working-directory: /transformers
        run: python3 -m pip uninstall -y transformers && python3 -m pip install -e .

      - name: NVIDIA-SMI
        run: |
          nvidia-smi

      - name: Environment
        working-directory: /transformers
        run: |
          python3 utils/print_env.py

      - name: Show installed libraries and their versions
        working-directory: /transformers
        run: pip freeze

      - name: Run all tests on GPU
        working-directory: /transformers
        run: python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }} tests/${{ matrix.folders }}

      - name: Failure short reports
        if: ${{ failure() }}
        continue-on-error: true
        run: cat /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }}/failures_short.txt

      - name: Test suite reports artifacts
        if: ${{ always() }}
        uses: actions/upload-artifact@v3
        with:
          # The artifact name uses the `/`-free `matrix_folders`; the report path
          # on disk still uses the raw folder name.
          name: ${{ matrix.machine_type }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports_postfix_nightly
          path: /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }}

  # Runs `tests/deepspeed` and `tests/extended` in the DeepSpeed image, once per
  # machine type. In this image the clone lives under `/workspace/transformers`.
  run_all_tests_torch_cuda_extensions_gpu:
    name: Torch CUDA extension tests
    strategy:
      fail-fast: false
      matrix:
        machine_type: [single-gpu, multi-gpu]
    runs-on: ${{ format('{0}-{1}', matrix.machine_type, 'docker-past-ci') }}
    needs: setup
    container:
      image: huggingface/transformers-pytorch-deepspeed-nightly-gpu
      options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
    steps:
      - name: Update clone
        working-directory: /workspace/transformers
        run: git fetch && git checkout ${{ github.sha }}

      - name: Reinstall transformers in edit mode (remove the one installed during docker image build)
        working-directory: /workspace/transformers
        run: python3 -m pip uninstall -y transformers && python3 -m pip install -e .

      - name: Remove cached torch extensions
        run: rm -rf /github/home/.cache/torch_extensions/

      # To avoid unknown test failures
      - name: Pre build DeepSpeed *again*
        working-directory: /workspace
        run: |
          python3 -m pip uninstall -y deepspeed
          rm -rf DeepSpeed
          git clone https://github.com/microsoft/DeepSpeed && cd DeepSpeed && rm -rf build
          DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 DS_BUILD_UTILS=1 python3 -m pip install . --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check

      - name: NVIDIA-SMI
        run: |
          nvidia-smi

      # `python3` is used here and below so the tests run with the same
      # interpreter that the `python3 -m pip` steps above installed into.
      - name: Environment
        working-directory: /workspace/transformers
        run: |
          python3 utils/print_env.py

      - name: Show installed libraries and their versions
        working-directory: /workspace/transformers
        run: pip freeze

      - name: Run all tests on GPU
        working-directory: /workspace/transformers
        run: |
          python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_tests_torch_cuda_extensions_gpu tests/deepspeed tests/extended

      - name: Failure short reports
        if: ${{ failure() }}
        continue-on-error: true
        run: cat /workspace/transformers/reports/${{ matrix.machine_type }}_tests_torch_cuda_extensions_gpu/failures_short.txt

      - name: Test suite reports artifacts
        if: ${{ always() }}
        uses: actions/upload-artifact@v3
        with:
          name: ${{ matrix.machine_type }}_run_tests_torch_cuda_extensions_gpu_test_reports_postfix_nightly
          path: /workspace/transformers/reports/${{ matrix.machine_type }}_tests_torch_cuda_extensions_gpu

  # Downloads every report artifact, runs `utils/notification_service.py`
  # with the Slack credentials, then deletes the artifacts.
  send_results:
    name: Send results to webhook
    runs-on: ubuntu-latest
    # Report even when upstream jobs failed or were skipped.
    if: always()
    needs:
      - check_runner_status
      - check_runners
      - setup
      - run_tests_single_gpu
      - run_tests_multi_gpu
      - run_all_tests_torch_cuda_extensions_gpu
    steps:
      - name: Preliminary job status
        shell: bash
        # Print the results of the gating jobs so they show up in this job's log.
        run: |
          echo "Runner availability: ${{ needs.check_runner_status.result }}"
          echo "Runner status: ${{ needs.check_runners.result }}"
          echo "Setup status: ${{ needs.setup.result }}"

      - uses: actions/checkout@v3

      # NOTE(review): the v3 artifact actions (upload/download) have been
      # deprecated by GitHub. Moving to v4 presumably also requires a
      # `geekyeggo/delete-artifact` release that understands v4 artifacts;
      # verify all three together before bumping.
      - uses: actions/download-artifact@v3

      - name: Send message to Slack
        env:
          CI_SLACK_BOT_TOKEN: ${{ secrets.CI_SLACK_BOT_TOKEN }}
          CI_SLACK_CHANNEL_ID: ${{ secrets.CI_SLACK_CHANNEL_ID }}
          CI_SLACK_CHANNEL_ID_DAILY: ${{ secrets.CI_SLACK_CHANNEL_ID_DAILY }}
          CI_SLACK_CHANNEL_DUMMY_TESTS: ${{ secrets.CI_SLACK_CHANNEL_DUMMY_TESTS }}
          CI_SLACK_REPORT_CHANNEL_ID: ${{ secrets.CI_SLACK_CHANNEL_ID_PAST_FUTURE }}
          ACCESS_REPO_INFO_TOKEN: ${{ secrets.ACCESS_REPO_INFO_TOKEN }}
          CI_EVENT: Nightly CI
          RUNNER_STATUS: ${{ needs.check_runner_status.result }}
          RUNNER_ENV_STATUS: ${{ needs.check_runners.result }}
          SETUP_STATUS: ${{ needs.setup.result }}
        # We pass `needs.setup.outputs.matrix` as the argument. A processing in `notification_service.py` to change
        # `models/bert` to `models_bert` is required, as the artifact names use `_` instead of `/`.
        run: |
          pip install slack_sdk
          pip show slack_sdk
          python utils/notification_service.py "${{ needs.setup.outputs.matrix }}"

      # delete-artifact
      - uses: geekyeggo/delete-artifact@v2
        with:
          # One glob per line; both prefixes match the `machine_type` values
          # that start every artifact name uploaded above.
          name: |
            single-*
            multi-*