@@ -51,6 +51,63 @@ def init(
5151 console .print (" 2. Run: [bold]evalmonkey run-benchmark --scenario mmlu[/bold]" )
5252 console .print (" 3. Run: [bold]evalmonkey run-chaos --scenario mmlu --chaos-profile client_prompt_injection[/bold]\n " )
5353
@app.command(name="generate-ci")
def generate_ci(
    output: str = typer.Option(
        ".github/workflows/evalmonkey.yml",
        help="Path to write the GitHub Actions workflow",
    )
):
    """
    Generate a GitHub Actions workflow to run EvalMonkey on every pull request.
    This creates an automated feedback loop for agent reliability!
    """
    # Imported locally so this block does not depend on the module's import
    # section, mirroring how the command originally pulled in its helpers.
    from pathlib import Path

    # Plain (non-f) string on purpose: the `${{ ... }}` expressions must reach
    # the workflow file verbatim for GitHub Actions to expand them.
    CI_YAML_TEMPLATE = """name: Agent Reliability Benchmark

on:
  pull_request:
    branches: [ main, master ]

jobs:
  evalmonkey-benchmark:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v3

      - name: Set up Python
        uses: actions/setup-python@v4
        with:
          python-version: '3.11'

      - name: Install EvalMonkey
        run: pip install git+https://github.com/Corbell-AI/evalmonkey.git

      # TODO: Add steps here to install dependencies and start your agent in the background
      # - name: Install Agent dependencies
      #   run: pip install -r requirements.txt
      # - name: Start Agent
      #   run: python src/agent.py &
      #   env:
      #     OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}

      - name: Wait for Agent
        run: sleep 5

      - name: Run EvalMonkey Benchmark
        run: evalmonkey run-benchmark --scenario gsm8k --limit 10
        env:
          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
          EVAL_MODEL: "gpt-4o"
"""
    workflow_path = Path(output)

    # `Path.parent` of a bare filename is ".", which mkdir accepts; the previous
    # os.makedirs(os.path.dirname(output)) received "" in that case and raised
    # FileNotFoundError (e.g. `--output evalmonkey.yml`).
    workflow_path.parent.mkdir(parents=True, exist_ok=True)

    # Explicit encoding keeps the written file independent of the platform's
    # default locale encoding.
    workflow_path.write_text(CI_YAML_TEMPLATE, encoding="utf-8")

    console.print(f"[bold green]✅ Generated CI/CD workflow at {output}[/bold green]")
    console.print("This will run EvalMonkey benchmarks on every PR to ensure your agent's reliability.")
    console.print("Make sure to update the workflow to start your agent in the background!")
109+
110+
54111@app .command ()
55112def list_benchmarks ():
56113 """Lists the 10 off-the-shelf benchmark datasets natively supported."""
0 commit comments