.github/workflows/e2e_ppo_trainer_megatron.yml (92 changes: 64 additions & 28 deletions)
@@ -40,19 +40,46 @@ concurrency:
 permissions:
   contents: read

+env:
+  IMAGE: "verl-ci-cn-beijing.cr.volces.com/whatcanyousee/verl:ngc-cu124-vllm0.8.5-sglang0.4.6.post5-mcore0.12.0-te2.3"
+  DYNAMIC_RUNNER_ENDPOINT: "https://sd10g3clalm04ug7alq90.apigateway-cn-beijing.volceapi.com/runner"
+
 jobs:
+  setup:
+    runs-on: ubuntu-latest
+    outputs:
+      runner-label: ${{ steps.create-runner.outputs.runner_label }}
+      mlp-task-id: ${{ steps.create-runner.outputs.mlp_task_id }}
+    steps:
+      - name: create runner
+        id: create-runner
+        shell: bash
+        run: |
+          if [[ "${{ github.event.repository.full_name }}" != "volcengine/verl" ]]; then
+            echo "no need create runner, skip"
+            exit 0
+          fi
+          resp=$(curl -X POST "${{ env.DYNAMIC_RUNNER_ENDPOINT }}/create" \
+            -d '{"Image": "${{ env.IMAGE }}"}')
+          runner_label=$(echo $resp | jq -r '.runner_label')
+          if [[ -z $runner_label || $runner_label == "null" ]]; then
+            echo "create runner failed"
+            exit 1
+          fi
+          echo "runner_label=$runner_label" >> $GITHUB_OUTPUT
+          mlp_task_id=$(echo $resp | jq -r '.task_id')
+          echo "mlp_task_id=$mlp_task_id" >> $GITHUB_OUTPUT
+
   e2e_ppo_trainer_megatron-deepseek:
-    runs-on: [L20x8]
+    needs: setup
+    runs-on: ["${{ needs.setup.outputs.runner-label || 'L20x8' }}"]
     timeout-minutes: 60 # Increase this timeout value as needed
     env:
       HTTP_PROXY: ${{ secrets.PROXY_HTTP }}
       HTTPS_PROXY: ${{ secrets.PROXY_HTTPS }}
       NO_PROXY: "localhost,127.0.0.1,hf-mirror.com"
       HF_ENDPOINT: "https://hf-mirror.com"
       HF_HUB_ENABLE_HF_TRANSFER: "0" # This is more stable
-    container:
-      image: whatcanyousee/verl:ngc-cu124-vllm0.8.5-sglang0.4.6.post5-mcore0.12.0-te2.3
-      options: --gpus all --shm-size=10g
     steps:
       - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
         with:
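A note on the setup job introduced above: the /create endpoint is assumed to return a JSON body carrying both a runner label and a task id, since those are what the jq filters extract; the payload shape itself is not documented in this diff. A minimal local sketch of that parsing, with placeholder values (only the field names runner_label and task_id come from the workflow):

    # Sketch: replicate the setup job's response parsing against a canned reply.
    # runner_label is what the e2e jobs target via runs-on; task_id is kept so
    # the cleanup job can delete the dynamic runner afterwards.
    resp='{"runner_label": "dyn-runner-example", "task_id": "t-0001"}'  # placeholder response
    runner_label=$(echo "$resp" | jq -r '.runner_label')
    mlp_task_id=$(echo "$resp" | jq -r '.task_id')
    echo "runner_label=$runner_label mlp_task_id=$mlp_task_id"

The new runs-on expression reads "${{ needs.setup.outputs.runner-label || 'L20x8' }}": if setup skipped runner creation (e.g. on forks, where it exits 0 without writing outputs), the expression falls back to the static L20x8 pool.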
@@ -84,17 +111,15 @@ jobs:
         run: |
           rm -rf checkpoints
   e2e_ppo_trainer_megatron-qwen3:
-    runs-on: [L20x8]
+    needs: setup
+    runs-on: ["${{ needs.setup.outputs.runner-label || 'L20x8' }}"]
     timeout-minutes: 60 # Increase this timeout value as needed
     env:
       HTTP_PROXY: ${{ secrets.PROXY_HTTP }}
       HTTPS_PROXY: ${{ secrets.PROXY_HTTPS }}
       NO_PROXY: "localhost,127.0.0.1,hf-mirror.com"
       HF_ENDPOINT: "https://hf-mirror.com"
       HF_HUB_ENABLE_HF_TRANSFER: "0" # This is more stable
-    container:
-      image: whatcanyousee/verl:ngc-cu124-vllm0.8.5-sglang0.4.6.post5-mcore0.12.0-te2.3
-      options: --gpus all --shm-size=10g
     steps:
       - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
         with:
@@ -123,17 +148,15 @@ jobs:
         run: |
           rm -rf checkpoints
   e2e_ppo_trainer_megatron-different-train-infer-tp-qwen-tie-embedding:
-    runs-on: [L20x8]
+    needs: setup
+    runs-on: ["${{ needs.setup.outputs.runner-label || 'L20x8' }}"]
     timeout-minutes: 60 # Increase this timeout value as needed
     env:
       HTTP_PROXY: ${{ secrets.PROXY_HTTP }}
       HTTPS_PROXY: ${{ secrets.PROXY_HTTPS }}
       NO_PROXY: "localhost,127.0.0.1,hf-mirror.com"
       HF_ENDPOINT: "https://hf-mirror.com"
       HF_HUB_ENABLE_HF_TRANSFER: "0" # This is more stable
-    container:
-      image: whatcanyousee/verl:ngc-cu124-vllm0.8.5-sglang0.4.6.post5-mcore0.12.0-te2.3
-      options: --gpus all --shm-size=10g
     steps:
       - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
         with:
@@ -156,17 +179,15 @@ jobs:
         run: |
           rm -rf checkpoints
   e2e_ppo_trainer_megatron-qwen-override-transformer-config:
-    runs-on: [L20x8]
+    needs: setup
+    runs-on: ["${{ needs.setup.outputs.runner-label || 'L20x8' }}"]
     timeout-minutes: 60 # Increase this timeout value as needed
     env:
       HTTP_PROXY: ${{ secrets.PROXY_HTTP }}
       HTTPS_PROXY: ${{ secrets.PROXY_HTTPS }}
       NO_PROXY: "localhost,127.0.0.1,hf-mirror.com"
       HF_ENDPOINT: "https://hf-mirror.com"
       HF_HUB_ENABLE_HF_TRANSFER: "0" # This is more stable
-    container:
-      image: whatcanyousee/verl:ngc-cu124-vllm0.8.5-sglang0.4.6.post5-mcore0.12.0-te2.3
-      options: --gpus all --shm-size=10g
     steps:
       - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
         with:
@@ -195,17 +216,15 @@ jobs:
         run: |
           rm -rf checkpoints
   e2e_ppo_trainer_megatron-deepseek-override-transformer-config:
-    runs-on: [L20x8]
+    needs: setup
+    runs-on: ["${{ needs.setup.outputs.runner-label || 'L20x8' }}"]
     timeout-minutes: 60 # Increase this timeout value as needed
     env:
       HTTP_PROXY: ${{ secrets.PROXY_HTTP }}
       HTTPS_PROXY: ${{ secrets.PROXY_HTTPS }}
       NO_PROXY: "localhost,127.0.0.1,hf-mirror.com"
       HF_ENDPOINT: "https://hf-mirror.com"
       HF_HUB_ENABLE_HF_TRANSFER: "0" # This is more stable
-    container:
-      image: whatcanyousee/verl:ngc-cu124-vllm0.8.5-sglang0.4.6.post5-mcore0.12.0-te2.3
-      options: --gpus all --shm-size=10g
     steps:
       - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
         with:
@@ -229,17 +248,15 @@ jobs:
         run: |
           rm -rf checkpoints
   e2e_ppo_trainer_megatron-moe-expert-parallel:
-    runs-on: [L20x8]
+    needs: setup
+    runs-on: ["${{ needs.setup.outputs.runner-label || 'L20x8' }}"]
     timeout-minutes: 60 # Increase this timeout value as needed
     env:
       HTTP_PROXY: ${{ secrets.PROXY_HTTP }}
       HTTPS_PROXY: ${{ secrets.PROXY_HTTPS }}
       NO_PROXY: "localhost,127.0.0.1,hf-mirror.com"
       HF_ENDPOINT: "https://hf-mirror.com"
       HF_HUB_ENABLE_HF_TRANSFER: "0" # This is more stable
-    container:
-      image: whatcanyousee/verl:ngc-cu124-vllm0.8.5-sglang0.4.6.post5-mcore0.12.0-te2.3
-      options: --gpus all --shm-size=10g
     steps:
       - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
         with:
@@ -263,17 +280,15 @@ jobs:
         run: |
           rm -rf checkpoints
   e2e_ppo_trainer_megatron-qwen2_5vl-3b:
-    runs-on: [L20x8]
+    needs: setup
+    runs-on: ["${{ needs.setup.outputs.runner-label || 'L20x8' }}"]
    timeout-minutes: 60 # Increase this timeout value as needed
     env:
       HTTP_PROXY: ${{ secrets.PROXY_HTTP }}
       HTTPS_PROXY: ${{ secrets.PROXY_HTTPS }}
       NO_PROXY: "localhost,127.0.0.1,hf-mirror.com"
       HF_ENDPOINT: "https://hf-mirror.com"
       HF_HUB_ENABLE_HF_TRANSFER: "0" # This is more stable
-    container:
-      image: whatcanyousee/verl:ngc-cu124-vllm0.8.5-sglang0.4.6.post5-mcore0.12.0-te2.3
-      options: --gpus all --shm-size=10g
     steps:
       - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
         with:
@@ -294,3 +309,24 @@ jobs:
       - name: clean up
         run: |
           rm -rf checkpoints
+
+  cleanup:
+    runs-on: ubuntu-latest
+    needs: [setup,
+      e2e_ppo_trainer_megatron-deepseek,
+      e2e_ppo_trainer_megatron-qwen3,
+      e2e_ppo_trainer_megatron-different-train-infer-tp-qwen-tie-embedding,
+      e2e_ppo_trainer_megatron-qwen-override-transformer-config,
+      e2e_ppo_trainer_megatron-deepseek-override-transformer-config,
+      e2e_ppo_trainer_megatron-moe-expert-parallel,
+      e2e_ppo_trainer_megatron-qwen2_5vl-3b]
+    if: always()
+    steps:
+      - name: remove runner
+        run: |
+          if [[ -z "${{ needs.setup.outputs.mlp-task-id }}" ]]; then
+            echo "no need remove runner, skip"
+            exit 0
+          fi
+          resp=$(curl -X POST "${{ env.DYNAMIC_RUNNER_ENDPOINT }}/delete" \
+            -d '{"TaskId": "${{ needs.setup.outputs.mlp-task-id }}"}')