diff --git a/.github/workflows/bench.yml b/.github/workflows/bench.yml index cfef72e194..b5e89a2b5e 100644 --- a/.github/workflows/bench.yml +++ b/.github/workflows/bench.yml @@ -23,19 +23,40 @@ jobs: filters: ".github/file-filter.yml" self: - name: Georgia Tech | Phoenix (NVHPC) + name: "${{ matrix.name }} (${{ matrix.device }})" if: ${{ github.repository == 'MFlowCode/MFC' && needs.file-changes.outputs.checkall == 'true' && ( (github.event_name == 'pull_request_review' && github.event.review.state == 'approved') || (github.event_name == 'pull_request' && github.event.pull_request.user.login == 'sbryngelson') - ) }} + ) }} needs: file-changes strategy: - matrix: - device: ['cpu', 'gpu'] fail-fast: false + matrix: + include: + - cluster: phoenix + name: Georgia Tech | Phoenix (NVHPC) + group: phoenix + labels: gt + flag: p + device: cpu + build_script: "" + - cluster: phoenix + name: Georgia Tech | Phoenix (NVHPC) + group: phoenix + labels: gt + flag: p + device: gpu + build_script: "" + - cluster: frontier + name: Oak Ridge | Frontier (CCE) + group: phoenix + labels: frontier + flag: f + device: gpu + build_script: "bash .github/workflows/frontier/build.sh gpu bench" runs-on: - group: phoenix - labels: gt + group: ${{ matrix.group }} + labels: ${{ matrix.labels }} timeout-minutes: 1400 env: ACTIONS_RUNNER_FORCE_ACTIONS_NODE_VERSION: node16 @@ -53,15 +74,22 @@ jobs: ref: master path: master + - name: Setup & Build + if: matrix.build_script != '' + run: | + (cd pr && ${{ matrix.build_script }}) & + (cd master && ${{ matrix.build_script }}) & + wait %1 && wait %2 + - name: Bench (Master v. PR) run: | - (cd pr && bash .github/workflows/phoenix/submit-bench.sh .github/workflows/phoenix/bench.sh ${{ matrix.device }}) & - (cd master && bash .github/workflows/phoenix/submit-bench.sh .github/workflows/phoenix/bench.sh ${{ matrix.device }}) & + (cd pr && bash .github/workflows/${{ matrix.cluster }}/submit-bench.sh .github/workflows/${{ matrix.cluster }}/bench.sh ${{ matrix.device }}) & + (cd master && bash .github/workflows/${{ matrix.cluster }}/submit-bench.sh .github/workflows/${{ matrix.cluster }}/bench.sh ${{ matrix.device }}) & wait %1 && wait %2 - name: Generate & Post Comment run: | - (cd pr && . ./mfc.sh load -c p -m g) + (cd pr && . ./mfc.sh load -c ${{ matrix.flag }} -m g) (cd pr && ./mfc.sh bench_diff ../master/bench-${{ matrix.device }}.yaml ../pr/bench-${{ matrix.device }}.yaml) - name: Print Logs @@ -72,9 +100,9 @@ jobs: - name: Archive Logs uses: actions/upload-artifact@v4 - if: always() + if: always() with: - name: logs-${{ matrix.device }} + name: ${{ matrix.cluster }}-${{ matrix.device }} path: | pr/bench-${{ matrix.device }}.* pr/build/benchmarks/* diff --git a/.github/workflows/frontier/bench.sh b/.github/workflows/frontier/bench.sh new file mode 100644 index 0000000000..31a514d45d --- /dev/null +++ b/.github/workflows/frontier/bench.sh @@ -0,0 +1,16 @@ +#!/bin/bash + +n_ranks=12 + +if [ "$job_device" = "gpu" ]; then + gpus=$(rocm-smi --showid | awk '{print $1}' | grep -Eo '[0-9]+' | uniq | tr '\n' ' ') + n_ranks=$(echo "$gpus" | wc -w) # number of GPUs on node + gpu_ids=$(echo "$gpus" | tr ' ' '\n' | tr '\n' ' ' | sed 's/ $//') # GPU IDs from rocm-smi + device_opts="--gpu -g $gpu_ids" +fi + +if [ "$job_device" = "gpu" ]; then + ./mfc.sh bench --mem 12 -j $n_ranks -o "$job_slug.yaml" -- -c frontier $device_opts -n $n_ranks +else + ./mfc.sh bench --mem 1 -j $(nproc) -o "$job_slug.yaml" -- -c frontier $device_opts -n $n_ranks +fi diff --git a/.github/workflows/frontier/build.sh b/.github/workflows/frontier/build.sh index 67b79ba3ba..c2e1893427 100644 --- a/.github/workflows/frontier/build.sh +++ b/.github/workflows/frontier/build.sh @@ -6,4 +6,12 @@ if [ "$1" = "gpu" ]; then fi . ./mfc.sh load -c f -m g -./mfc.sh test --dry-run -j 8 $build_opts + +if [ "$2" == "bench" ]; then + for dir in benchmarks/*/; do + dirname=$(basename "$dir") + ./mfc.sh run "$dir/case.py" --case-optimization -j 8 --dry-run $build_opts + done +else + ./mfc.sh test --dry-run -j 8 $build_opts +fi diff --git a/.github/workflows/frontier/submit-bench.sh b/.github/workflows/frontier/submit-bench.sh new file mode 100644 index 0000000000..4e498a6090 --- /dev/null +++ b/.github/workflows/frontier/submit-bench.sh @@ -0,0 +1,54 @@ +#!/bin/bash + +set -e + +usage() { + echo "Usage: $0 [script.sh] [cpu|gpu]" +} + +if [ ! -z "$1" ]; then + sbatch_script_contents=`cat $1` +else + usage + exit 1 +fi + +if [ "$2" = "cpu" ]; then + sbatch_device_opts="\ +#SBATCH -n 32 # Number of cores required" +elif [ "$2" = "gpu" ]; then + sbatch_device_opts="\ +#SBATCH -n 8 # Number of cores required" +else + usage; exit 1 +fi + + +job_slug="`basename "$1" | sed 's/\.sh$//' | sed 's/[^a-zA-Z0-9]/-/g'`-$2" + +sbatch <