Skip to content

Commit 525d1ea

Browse files
authored
feat: update triage LLMs to GPT-5.4 (#333)
1 parent 1d50cca commit 525d1ea

12 files changed

Lines changed: 576 additions & 149 deletions

File tree

packages/triage/README.md

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -42,9 +42,9 @@ Required environment variables:
4242

4343
Model dependency:
4444

45-
- `extractClaimsFromNewEvidence` uses `gpt-5-mini`.
46-
- `triageNewEvidence` uses `gpt-5-mini`.
47-
- title/slug generation uses `gpt-5-nano`.
45+
- `extractClaimsFromNewEvidence` uses `gpt-5.4-mini`.
46+
- `triageNewEvidence` uses `gpt-5.4-mini`.
47+
- title/slug generation uses `gpt-5.4-nano`.
4848
- translation uses `gpt-5.4-nano`.
4949

5050
Expected cost before running the checked-in eval set: less than USD 1 with the
Lines changed: 99 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,99 @@
1+
import { describe, expect, it } from 'vitest';
2+
import {
3+
estimateOpenAICostFromUsage,
4+
OpenAIUsageCostTracker,
5+
} from './estimateOpenAICost.js';
6+
7+
describe('estimateOpenAICostFromUsage', () => {
  // The expected costs are fractions of a cent. `toBeCloseTo` defaults to two
  // decimal digits (a tolerance of 0.005), which is the same order of magnitude
  // as the values under test, so every assertion pins eight digits instead.
  const COST_PRECISION_DIGITS = 8;

  // Same token counts for both models so that only the pricing entry differs.
  const usage = {
    inputTokens: 1000,
    cachedInputTokens: 100,
    outputTokens: 2000,
    totalTokens: 3000,
  };

  it('prices gpt-5.4 mini usage with cached input tokens', () => {
    const estimate = estimateOpenAICostFromUsage({
      model: 'gpt-5.4-mini',
      usage: { ...usage },
    });

    expect(estimate?.estimatedCostUsd).toBeCloseTo(
      0.0096825,
      COST_PRECISION_DIGITS,
    );
  });

  it('prices gpt-5.4 nano usage', () => {
    const estimate = estimateOpenAICostFromUsage({
      model: 'gpt-5.4-nano',
      usage: { ...usage },
    });

    expect(estimate?.estimatedCostUsd).toBeCloseTo(
      0.002682,
      COST_PRECISION_DIGITS,
    );
  });
});
36+
37+
describe('OpenAIUsageCostTracker', () => {
  it('sums usage and cost across multiple responses', () => {
    const tracker = new OpenAIUsageCostTracker();

    tracker.add({
      model: 'gpt-5.4-mini',
      usage: {
        inputTokens: 1000,
        cachedInputTokens: 100,
        outputTokens: 2000,
        totalTokens: 3000,
      },
    });
    tracker.add({
      model: 'gpt-5.4-mini',
      usage: {
        inputTokens: 3000,
        cachedInputTokens: 200,
        outputTokens: 4000,
        totalTokens: 7000,
      },
    });

    const summary = tracker.summary();

    // Expected total, per 1M tokens at the gpt-5.4-mini rates (0.75 input,
    // 0.075 cached input, 4.5 output):
    //   response 1: 900 * 0.75 + 100 * 0.075 + 2000 * 4.5 -> 0.0096825 USD
    //   response 2: 2800 * 0.75 + 200 * 0.075 + 4000 * 4.5 -> 0.020115 USD
    //   total: 0.0297975 USD
    // NOTE(review): this assumes cached tokens are billed at the cached rate
    // and the rest of inputTokens at the input rate, which is the formula the
    // single-response expectation 0.0096825 is consistent with.
    //
    // An explicit precision is required: the default of two decimal digits
    // tolerates an error of 0.005, far larger than any pricing mistake here.
    expect(summary.estimatedCostUsd).toBeCloseTo(0.0297975, 8);
    expect(summary).toEqual({
      usage: {
        inputTokens: 4000,
        cachedInputTokens: 300,
        outputTokens: 6000,
        totalTokens: 10000,
      },
      // The cost is asserted numerically above; echo it here so that
      // `toEqual` does not compare floats exactly.
      estimatedCostUsd: summary.estimatedCostUsd,
      modelsWithoutPricing: [],
    });
  });

  it('tracks models without configured pricing', () => {
    const tracker = new OpenAIUsageCostTracker();

    tracker.add({
      model: 'unknown-model',
      usage: {
        inputTokens: 1000,
        cachedInputTokens: 0,
        outputTokens: 1000,
        totalTokens: 2000,
      },
    });

    expect(tracker.summary()).toEqual({
      usage: {
        inputTokens: 1000,
        cachedInputTokens: 0,
        outputTokens: 1000,
        totalTokens: 2000,
      },
      estimatedCostUsd: null,
      modelsWithoutPricing: ['unknown-model'],
    });
  });
});

packages/triage/src/helpers/estimateOpenAICost.ts

Lines changed: 101 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -35,16 +35,27 @@ export type OpenAIModelPricing = {
3535
outputUsdPer1MTokens: number;
3636
};
3737

38+
/**
 * Aggregated token usage and cost figures.
 *
 * As produced by `OpenAIUsageCostTracker.summary()`:
 * - `usage` is the field-by-field sum of every recorded usage, or `null` when
 *   nothing has been recorded.
 * - `estimatedCostUsd` is the summed estimate in USD, or `null` when at least
 *   one recorded model had no pricing entry.
 * - `modelsWithoutPricing` lists those unpriced model names, sorted.
 */
export type OpenAIUsageCostSummary = {
39+
usage: OpenAITokenUsage | null;
40+
estimatedCostUsd: number | null;
41+
modelsWithoutPricing: string[];
42+
};
43+
3844
/**
 * Pricing table keyed by model name. Each entry gives the USD rate per one
 * million tokens for input, cached input and output tokens.
 *
 * NOTE(review): the rates are hard-coded here; confirm them against the
 * provider's current price list whenever a model entry is added or changed.
 */
export const OPENAI_MODEL_PRICING: Record<string, OpenAIModelPricing> = {
39-
'gpt-5-mini': {
40-
inputUsdPer1MTokens: 0.25,
41-
cachedInputUsdPer1MTokens: 0.025,
42-
outputUsdPer1MTokens: 2,
45+
'gpt-5.4': {
46+
inputUsdPer1MTokens: 2.5,
47+
cachedInputUsdPer1MTokens: 0.25,
48+
outputUsdPer1MTokens: 15,
4349
},
44-
'gpt-5-nano': {
45-
inputUsdPer1MTokens: 0.05,
46-
cachedInputUsdPer1MTokens: 0.005,
47-
outputUsdPer1MTokens: 0.4,
50+
'gpt-5.4-mini': {
51+
inputUsdPer1MTokens: 0.75,
52+
cachedInputUsdPer1MTokens: 0.075,
53+
outputUsdPer1MTokens: 4.5,
54+
},
55+
'gpt-5.4-nano': {
56+
inputUsdPer1MTokens: 0.2,
57+
cachedInputUsdPer1MTokens: 0.02,
58+
outputUsdPer1MTokens: 1.25,
4859
},
4960
};
5061

@@ -114,3 +125,85 @@ export function estimateOpenAICostFromUsage({
114125
pricing,
115126
};
116127
}
128+
129+
/**
 * Adds two token-usage records field by field.
 *
 * A `null` operand means "no usage recorded" and acts as the identity, so the
 * result is `null` only when both operands are `null`.
 *
 * The result is always a freshly allocated object and never one of the
 * operands. Callers that keep the result as running state (for example an
 * accumulator) therefore cannot have it changed behind their back when the
 * object they passed in is mutated later.
 *
 * @param left - Running total, or `null` when nothing has been recorded yet.
 * @param right - Usage to add, or `null` when there is nothing to add.
 * @returns A new usage object holding the sums, or `null` when both operands
 *   are `null`.
 */
export function sumOpenAITokenUsage(
  left: OpenAITokenUsage | null,
  right: OpenAITokenUsage | null,
): OpenAITokenUsage | null {
  if (left == null) {
    // Shallow copy instead of handing back the caller's own object.
    return right == null ? null : { ...right };
  }
  if (right == null) {
    return { ...left };
  }

  return {
    inputTokens: left.inputTokens + right.inputTokens,
    cachedInputTokens: left.cachedInputTokens + right.cachedInputTokens,
    outputTokens: left.outputTokens + right.outputTokens,
    totalTokens: left.totalTokens + right.totalTokens,
  };
}
147+
148+
/**
 * Accumulates token usage and estimated cost over a series of responses.
 *
 * Call `add` once per response and `summary` to read the running totals.
 */
export class OpenAIUsageCostTracker {
  private usage: OpenAITokenUsage | null = null;
  private estimatedCostUsd = 0;
  private readonly modelsWithoutPricing = new Set<string>();

  /**
   * Records one response. A `null` usage is ignored entirely. Usage for a
   * model without a cost estimate still counts towards the token totals, and
   * the model name is remembered so the summary can report it.
   */
  add({ model, usage }: { model: string; usage: OpenAITokenUsage | null }) {
    if (usage != null) {
      this.usage = sumOpenAITokenUsage(this.usage, usage);

      const estimate = estimateOpenAICostFromUsage({ model, usage });
      if (estimate != null) {
        this.estimatedCostUsd += estimate.estimatedCostUsd;
      } else {
        this.modelsWithoutPricing.add(model);
      }
    }
  }

  /**
   * Returns the totals recorded so far. The cost is reported as `null` as soon
   * as any recorded model had no estimate, because the sum would be partial.
   */
  summary(): OpenAIUsageCostSummary {
    const unpricedModels = Array.from(this.modelsWithoutPricing).sort();
    const everyModelPriced = this.modelsWithoutPricing.size === 0;

    return {
      usage: this.usage,
      estimatedCostUsd: everyModelPriced ? this.estimatedCostUsd : null,
      modelsWithoutPricing: unpricedModels,
    };
  }
}
178+
179+
/**
 * Writes a usage/cost summary to `console.log`, prefixing every line with
 * `[label]`.
 *
 * Output is one of:
 * - a single "usage is unavailable" line when `summary.usage` is null;
 * - the token totals followed by the estimated cost with 8 decimals;
 * - the token totals followed by the list of models that have no pricing,
 *   when no cost estimate is available.
 */
export function logOpenAIUsageCostSummary({
  label,
  summary,
}: {
  label: string;
  summary: OpenAIUsageCostSummary;
}) {
  const { usage, estimatedCostUsd } = summary;

  if (usage == null) {
    console.log(`[${label}] Usage is unavailable`);
    return;
  }

  // Log only the four known counters, not whatever else the object carries.
  const { inputTokens, cachedInputTokens, outputTokens, totalTokens } = usage;
  console.log(`[${label}] Total usage:`, {
    inputTokens,
    cachedInputTokens,
    outputTokens,
    totalTokens,
  });

  if (estimatedCostUsd == null) {
    console.log(
      `[${label}] No pricing configured for model(s): ${summary.modelsWithoutPricing.join(', ')}`,
    );
    return;
  }

  console.log(
    `[${label}] Total estimated cost (USD):`,
    estimatedCostUsd.toFixed(8),
  );
}
Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,56 @@
1+
import { describe, expect, it, vi } from 'vitest';
2+
import { isRetryableOpenAIError, runOpenAIRequestWithRetry } from './client.js';
3+
4+
describe('isRetryableOpenAIError', () => {
  it('treats transient statuses as retryable', () => {
    // Request timeout, conflict, rate limit and a server error.
    for (const status of [408, 409, 429, 500]) {
      expect(isRetryableOpenAIError({ status })).toBe(true);
    }
  });

  it('does not retry client validation errors', () => {
    for (const status of [400, 422]) {
      expect(isRetryableOpenAIError({ status })).toBe(false);
    }
  });
});
17+
18+
describe('runOpenAIRequestWithRetry', () => {
  it('retries a transient failure and returns the successful result', async () => {
    const warnSpy = vi.spyOn(console, 'warn').mockImplementation(() => {});
    // Restore in `finally` so a failing assertion cannot leave `console.warn`
    // mocked for the tests that run afterwards.
    try {
      const request = vi
        .fn<() => Promise<string>>()
        .mockRejectedValueOnce({ status: 500 })
        .mockResolvedValueOnce('ok');
      const sleep = vi.fn<() => Promise<void>>().mockResolvedValue(undefined);

      await expect(
        runOpenAIRequestWithRetry(request, {
          label: 'testRequest',
          initialDelayMs: 5,
          sleep,
        }),
      ).resolves.toBe('ok');

      expect(request).toHaveBeenCalledTimes(2);
      expect(sleep).toHaveBeenCalledTimes(1);
      expect(sleep).toHaveBeenCalledWith(5);
    } finally {
      warnSpy.mockRestore();
    }
  });

  it('does not retry non-retryable failures', async () => {
    const request = vi.fn<() => Promise<string>>().mockRejectedValue({
      status: 400,
    });
    const sleep = vi.fn<() => Promise<void>>().mockResolvedValue(undefined);

    await expect(
      runOpenAIRequestWithRetry(request, {
        label: 'testRequest',
        sleep,
      }),
    ).rejects.toEqual({ status: 400 });

    expect(request).toHaveBeenCalledTimes(1);
    expect(sleep).not.toHaveBeenCalled();
  });

  it('doubles the delay between attempts and rethrows once attempts run out', async () => {
    const warnSpy = vi.spyOn(console, 'warn').mockImplementation(() => {});
    try {
      const request = vi.fn<() => Promise<string>>().mockRejectedValue({
        status: 500,
      });
      const sleep = vi.fn<() => Promise<void>>().mockResolvedValue(undefined);

      await expect(
        runOpenAIRequestWithRetry(request, {
          label: 'testRequest',
          maxAttempts: 3,
          initialDelayMs: 10,
          sleep,
        }),
      ).rejects.toEqual({ status: 500 });

      // Three attempts means two waits: 10ms, then 20ms. The third failure is
      // rethrown without sleeping again.
      expect(request).toHaveBeenCalledTimes(3);
      expect(sleep).toHaveBeenCalledTimes(2);
      expect(sleep).toHaveBeenNthCalledWith(1, 10);
      expect(sleep).toHaveBeenNthCalledWith(2, 20);
    } finally {
      warnSpy.mockRestore();
    }
  });
});

packages/triage/src/llm/client.ts

Lines changed: 79 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,12 @@
11
import OpenAI from 'openai';
22

3+
/**
 * Options accepted by `runOpenAIRequestWithRetry`.
 *
 * - `label` prefixes the warning logged before each retry.
 * - `maxAttempts` is the total number of attempts, including the first one
 *   (`runOpenAIRequestWithRetry` defaults it to 4).
 * - `initialDelayMs` is the wait before the first retry; it doubles for each
 *   later retry (`runOpenAIRequestWithRetry` defaults it to 500).
 * - `sleep` replaces the wait function, e.g. so tests do not really wait
 *   (`runOpenAIRequestWithRetry` defaults it to a `setTimeout`-based sleep).
 */
export type OpenAIRetryOptions = {
4+
label: string;
5+
maxAttempts?: number;
6+
initialDelayMs?: number;
7+
sleep?: (ms: number) => Promise<void>;
8+
};
9+
310
export function getOpenAiClient() {
411
const apiKey = process.env.OPENAI_API_KEY;
512
if (apiKey == null || apiKey.trim() === '') {
@@ -10,3 +17,75 @@ export function getOpenAiClient() {
1017
apiKey,
1118
});
1219
}
20+
21+
/**
 * Runs `request`, retrying with exponential backoff while it keeps failing
 * with an error that `isRetryableOpenAIError` accepts.
 *
 * The wait before retry k (1-based) is `initialDelayMs * 2 ** (k - 1)`. A
 * warning is logged before each wait. The last error is rethrown unchanged
 * when it is not retryable or when `maxAttempts` attempts have been made.
 *
 * @param request - Factory that performs one attempt; called once per attempt.
 * @returns Whatever the first successful attempt resolves to.
 * @throws The error of the final attempt, as thrown by `request`.
 */
export async function runOpenAIRequestWithRetry<T>(
  request: () => Promise<T>,
  {
    label,
    maxAttempts = 4,
    initialDelayMs = 500,
    sleep = sleepMs,
  }: OpenAIRetryOptions,
): Promise<T> {
  // Leaves the loop only by returning a result or rethrowing an error.
  for (let attempt = 1; ; attempt++) {
    try {
      return await request();
    } catch (error) {
      const outOfAttempts = attempt >= maxAttempts;
      if (outOfAttempts || !isRetryableOpenAIError(error)) {
        throw error;
      }

      const delayMs = initialDelayMs * 2 ** (attempt - 1);
      console.warn(
        `${label}: OpenAI request failed with a retryable error; retrying attempt ${attempt + 1}/${maxAttempts} in ${delayMs}ms.`,
      );

      await sleep(delayMs);
    }
  }
}
50+
51+
/**
 * Decides whether a thrown value looks like a transient failure.
 *
 * A numeric `status` property takes precedence: 408, 409, 429 and anything
 * from 500 upwards are retryable, every other status is not. Only when there
 * is no numeric `status` is the string `code` property consulted. Values that
 * are not objects are never retryable.
 */
export function isRetryableOpenAIError(error: unknown): boolean {
  if (typeof error !== 'object' || error == null) {
    return false;
  }

  const status = getNumericProperty(error, 'status');
  if (status == null) {
    switch (getStringProperty(error, 'code')) {
      case 'server_error':
      case 'rate_limit_exceeded':
      case 'timeout':
        return true;
      default:
        return false;
    }
  }

  switch (status) {
    case 408:
    case 409:
    case 429:
      return true;
    default:
      return status >= 500;
  }
}
68+
69+
/**
 * Reads `key` from an arbitrary object and returns it only when the value is
 * a number; a missing key or any other value type yields `null`.
 */
function getNumericProperty(value: object, key: string): number | null {
  const candidate: unknown =
    key in value ? (value as Record<string, unknown>)[key] : undefined;

  if (typeof candidate !== 'number') {
    return null;
  }
  return candidate;
}
77+
78+
/**
 * Reads `key` from an arbitrary object and returns it only when the value is
 * a string; a missing key or any other value type yields `null`.
 */
function getStringProperty(value: object, key: string): string | null {
  const candidate: unknown =
    key in value ? (value as Record<string, unknown>)[key] : undefined;

  if (typeof candidate !== 'string') {
    return null;
  }
  return candidate;
}
86+
87+
/** Resolves after `ms` milliseconds, using `setTimeout`. */
function sleepMs(ms: number): Promise<void> {
  return new Promise<void>((done) => {
    setTimeout(done, ms);
  });
}

0 commit comments

Comments
 (0)