Skip to content

Commit a515c89

Browse files
authored
feat: add evalmonkey generate-ci command for easy GitHub Actions setup (#6)
1 parent a9fc06a commit a515c89

1 file changed

Lines changed: 57 additions & 0 deletions

File tree

scripts/cli.py

Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,63 @@ def init(
5151
console.print(" 2. Run: [bold]evalmonkey run-benchmark --scenario mmlu[/bold]")
5252
console.print(" 3. Run: [bold]evalmonkey run-chaos --scenario mmlu --chaos-profile client_prompt_injection[/bold]\n")
5353

54+
@app.command(name="generate-ci")
def generate_ci(
    output: str = typer.Option(".github/workflows/evalmonkey.yml", help="Path to write the GitHub Actions workflow")
):
    """
    Generate a GitHub Actions workflow to run EvalMonkey on every pull request.
    This creates an automated feedback loop for agent reliability!
    """
    # NOTE: the docstring above is kept verbatim because Typer surfaces it as
    # the command's --help text.
    #
    # `output` is the destination path of the workflow file. Missing parent
    # directories are created; an existing file at that path is overwritten.
    from pathlib import Path

    # Plain (non-f) string on purpose: the `${{ ... }}` expressions must reach
    # the workflow file untouched so GitHub Actions can expand them.
    # The template lines start at column 0 because they are written out as-is.
    CI_YAML_TEMPLATE = """name: Agent Reliability Benchmark

on:
  pull_request:
    branches: [ main, master ]

jobs:
  evalmonkey-benchmark:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v3

      - name: Set up Python
        uses: actions/setup-python@v4
        with:
          python-version: '3.11'

      - name: Install EvalMonkey
        run: pip install git+https://github.com/Corbell-AI/evalmonkey.git

      # TODO: Add steps here to install dependencies and start your agent in the background
      # - name: Install Agent dependencies
      #   run: pip install -r requirements.txt
      # - name: Start Agent
      #   run: python src/agent.py &
      #   env:
      #     OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}

      - name: Wait for Agent
        run: sleep 5

      - name: Run EvalMonkey Benchmark
        run: evalmonkey run-benchmark --scenario gsm8k --limit 10
        env:
          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
          EVAL_MODEL: "gpt-4o"
"""

    workflow_path = Path(output)
    # Path.parent is "." for a bare filename, so this is safe where
    # os.makedirs(os.path.dirname(output)) would fail on an empty string.
    workflow_path.parent.mkdir(parents=True, exist_ok=True)
    # Pin the encoding so the generated file does not depend on the locale.
    workflow_path.write_text(CI_YAML_TEMPLATE, encoding="utf-8")

    console.print(f"[bold green]✅ Generated CI/CD workflow at {output}[/bold green]")
    console.print("This will run EvalMonkey benchmarks on every PR to ensure your agent's reliability.")
    console.print("Make sure to update the workflow to start your agent in the background!")
109+
110+
54111
@app.command()
55112
def list_benchmarks():
56113
"""Lists the 10 off-the-shelf benchmark datasets natively supported."""

0 commit comments

Comments
 (0)