.github/workflows/e2e_ppo_trainer_megatron.yml (92 changes: 64 additions & 28 deletions)
@@ -40,19 +40,46 @@ concurrency:
 permissions:
   contents: read

+env:
+  IMAGE: "verl-ci-cn-beijing.cr.volces.com/whatcanyousee/verl:ngc-cu124-vllm0.8.5-sglang0.4.6.post5-mcore0.12.0-te2.3"
+  DYNAMIC_RUNNER_ENDPOINT: "https://sd10g3clalm04ug7alq90.apigateway-cn-beijing.volceapi.com/runner"
+
 jobs:
+  setup:
+    runs-on: ubuntu-latest
+    outputs:
+      runner-label: ${{ steps.create-runner.outputs.runner_label }}
+      mlp-task-id: ${{ steps.create-runner.outputs.mlp_task_id }}
+    steps:
+      - name: create runner
+        id: create-runner
+        shell: bash
+        run: |
+          if [[ "${{ github.event.repository.full_name }}" != "volcengine/verl" ]]; then
+            echo "no need create runner, skip"
+            exit 0
+          fi
+          resp=$(curl -X POST "${{ env.DYNAMIC_RUNNER_ENDPOINT }}/create" \
+            -d '{"Image": "${{ env.IMAGE }}"}')
+          runner_label=$(echo $resp | jq -r '.runner_label')
+          if [[ -z $runner_label || $runner_label == "null" ]]; then
+            echo "create runner failed"
+            exit 1
+          fi
+          echo "runner_label=$runner_label" >> $GITHUB_OUTPUT
+          mlp_task_id=$(echo $resp | jq -r '.task_id')
+          echo "mlp_task_id=$mlp_task_id" >> $GITHUB_OUTPUT
+
   e2e_ppo_trainer_megatron-deepseek:
-    runs-on: [L20x8]
+    needs: setup
+    runs-on: ["${{ needs.setup.outputs.runner-label || 'L20x8' }}"]
     timeout-minutes: 60 # Increase this timeout value as needed
     env:
       HTTP_PROXY: ${{ secrets.PROXY_HTTP }}
       HTTPS_PROXY: ${{ secrets.PROXY_HTTPS }}
       NO_PROXY: "localhost,127.0.0.1,hf-mirror.com"
       HF_ENDPOINT: "https://hf-mirror.com"
       HF_HUB_ENABLE_HF_TRANSFER: "0" # This is more stable
-    container:
-      image: whatcanyousee/verl:ngc-cu124-vllm0.8.5-sglang0.4.6.post5-mcore0.12.0-te2.3
-      options: --gpus all --shm-size=10g
     steps:
       - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
         with:
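A note on the setup job introduced above: the /create endpoint is assumed to return a JSON body carrying both a runner label and a task id, since those are what the jq filters extract; the payload shape itself is not documented in this diff. A minimal local sketch of that parsing, with placeholder values (only the field names runner_label and task_id come from the workflow):

    # Sketch: replicate the setup job's response parsing against a canned reply.
    # runner_label is what the e2e jobs target via runs-on; task_id is kept so
    # the cleanup job can delete the dynamic runner afterwards.
    resp='{"runner_label": "dyn-runner-example", "task_id": "t-0001"}'  # placeholder response
    runner_label=$(echo "$resp" | jq -r '.runner_label')
    mlp_task_id=$(echo "$resp" | jq -r '.task_id')
    echo "runner_label=$runner_label mlp_task_id=$mlp_task_id"

The new runs-on expression reads "${{ needs.setup.outputs.runner-label || 'L20x8' }}": if setup skipped runner creation (e.g. on forks, where it exits 0 without writing outputs), the expression falls back to the static L20x8 pool.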
@@ -84,17 +111,15 @@ jobs:
         run: |
           rm -rf checkpoints
   e2e_ppo_trainer_megatron-qwen3:
-    runs-on: [L20x8]
+    needs: setup
+    runs-on: ["${{ needs.setup.outputs.runner-label || 'L20x8' }}"]
     timeout-minutes: 60 # Increase this timeout value as needed
     env:
       HTTP_PROXY: ${{ secrets.PROXY_HTTP }}
       HTTPS_PROXY: ${{ secrets.PROXY_HTTPS }}
       NO_PROXY: "localhost,127.0.0.1,hf-mirror.com"
       HF_ENDPOINT: "https://hf-mirror.com"
       HF_HUB_ENABLE_HF_TRANSFER: "0" # This is more stable
-    container:
-      image: whatcanyousee/verl:ngc-cu124-vllm0.8.5-sglang0.4.6.post5-mcore0.12.0-te2.3
-      options: --gpus all --shm-size=10g
     steps:
       - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
         with:
@@ -123,17 +148,15 @@ jobs:
         run: |
           rm -rf checkpoints
   e2e_ppo_trainer_megatron-different-train-infer-tp-qwen-tie-embedding:
-    runs-on: [L20x8]
+    needs: setup
+    runs-on: ["${{ needs.setup.outputs.runner-label || 'L20x8' }}"]
     timeout-minutes: 60 # Increase this timeout value as needed
     env:
       HTTP_PROXY: ${{ secrets.PROXY_HTTP }}
       HTTPS_PROXY: ${{ secrets.PROXY_HTTPS }}
       NO_PROXY: "localhost,127.0.0.1,hf-mirror.com"
       HF_ENDPOINT: "https://hf-mirror.com"
       HF_HUB_ENABLE_HF_TRANSFER: "0" # This is more stable
-    container:
-      image: whatcanyousee/verl:ngc-cu124-vllm0.8.5-sglang0.4.6.post5-mcore0.12.0-te2.3
-      options: --gpus all --shm-size=10g
     steps:
       - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
         with:
@@ -156,17 +179,15 @@ jobs:
         run: |
           rm -rf checkpoints
   e2e_ppo_trainer_megatron-qwen-override-transformer-config:
-    runs-on: [L20x8]
+    needs: setup
+    runs-on: ["${{ needs.setup.outputs.runner-label || 'L20x8' }}"]
     timeout-minutes: 60 # Increase this timeout value as needed
     env:
       HTTP_PROXY: ${{ secrets.PROXY_HTTP }}
       HTTPS_PROXY: ${{ secrets.PROXY_HTTPS }}
       NO_PROXY: "localhost,127.0.0.1,hf-mirror.com"
       HF_ENDPOINT: "https://hf-mirror.com"
       HF_HUB_ENABLE_HF_TRANSFER: "0" # This is more stable
-    container:
-      image: whatcanyousee/verl:ngc-cu124-vllm0.8.5-sglang0.4.6.post5-mcore0.12.0-te2.3
-      options: --gpus all --shm-size=10g
     steps:
       - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
         with:
@@ -195,17 +216,15 @@ jobs:
         run: |
           rm -rf checkpoints
   e2e_ppo_trainer_megatron-deepseek-override-transformer-config:
-    runs-on: [L20x8]
+    needs: setup
+    runs-on: ["${{ needs.setup.outputs.runner-label || 'L20x8' }}"]
     timeout-minutes: 60 # Increase this timeout value as needed
     env:
       HTTP_PROXY: ${{ secrets.PROXY_HTTP }}
       HTTPS_PROXY: ${{ secrets.PROXY_HTTPS }}
       NO_PROXY: "localhost,127.0.0.1,hf-mirror.com"
       HF_ENDPOINT: "https://hf-mirror.com"
       HF_HUB_ENABLE_HF_TRANSFER: "0" # This is more stable
-    container:
-      image: whatcanyousee/verl:ngc-cu124-vllm0.8.5-sglang0.4.6.post5-mcore0.12.0-te2.3
-      options: --gpus all --shm-size=10g
     steps:
       - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
         with:
@@ -229,17 +248,15 @@ jobs:
         run: |
           rm -rf checkpoints
   e2e_ppo_trainer_megatron-moe-expert-parallel:
-    runs-on: [L20x8]
+    needs: setup
+    runs-on: ["${{ needs.setup.outputs.runner-label || 'L20x8' }}"]
     timeout-minutes: 60 # Increase this timeout value as needed
     env:
       HTTP_PROXY: ${{ secrets.PROXY_HTTP }}
       HTTPS_PROXY: ${{ secrets.PROXY_HTTPS }}
       NO_PROXY: "localhost,127.0.0.1,hf-mirror.com"
       HF_ENDPOINT: "https://hf-mirror.com"
       HF_HUB_ENABLE_HF_TRANSFER: "0" # This is more stable
-    container:
-      image: whatcanyousee/verl:ngc-cu124-vllm0.8.5-sglang0.4.6.post5-mcore0.12.0-te2.3
-      options: --gpus all --shm-size=10g
     steps:
       - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
         with:
@@ -263,17 +280,15 @@ jobs:
         run: |
           rm -rf checkpoints
   e2e_ppo_trainer_megatron-qwen2_5vl-3b:
-    runs-on: [L20x8]
+    needs: setup
+    runs-on: ["${{ needs.setup.outputs.runner-label || 'L20x8' }}"]
    timeout-minutes: 60 # Increase this timeout value as needed
     env:
       HTTP_PROXY: ${{ secrets.PROXY_HTTP }}
       HTTPS_PROXY: ${{ secrets.PROXY_HTTPS }}
       NO_PROXY: "localhost,127.0.0.1,hf-mirror.com"
       HF_ENDPOINT: "https://hf-mirror.com"
       HF_HUB_ENABLE_HF_TRANSFER: "0" # This is more stable
-    container:
-      image: whatcanyousee/verl:ngc-cu124-vllm0.8.5-sglang0.4.6.post5-mcore0.12.0-te2.3
-      options: --gpus all --shm-size=10g
     steps:
       - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
         with:
@@ -294,3 +309,24 @@ jobs:
       - name: clean up
         run: |
           rm -rf checkpoints
+
+  cleanup:
+    runs-on: ubuntu-latest
+    needs: [setup,
+      e2e_ppo_trainer_megatron-deepseek,
+      e2e_ppo_trainer_megatron-qwen3,
+      e2e_ppo_trainer_megatron-different-train-infer-tp-qwen-tie-embedding,
+      e2e_ppo_trainer_megatron-qwen-override-transformer-config,
+      e2e_ppo_trainer_megatron-deepseek-override-transformer-config,
+      e2e_ppo_trainer_megatron-moe-expert-parallel,
+      e2e_ppo_trainer_megatron-qwen2_5vl-3b]
+    if: always()
+    steps:
+      - name: remove runner
+        run: |
+          if [[ -z "${{ needs.setup.outputs.mlp-task-id }}" ]]; then
+            echo "no need remove runner, skip"
+            exit 0
+          fi
+          resp=$(curl -X POST "${{ env.DYNAMIC_RUNNER_ENDPOINT }}/delete" \
+            -d '{"TaskId": "${{ needs.setup.outputs.mlp-task-id }}"}')