Skip to content

Commit a5a853f

Browse files
authored
feat(tools): image reference processing and native provider support (#1251)
* feat(tools): implement image reference processing and native provider support

  - Support OpenAI image edits via both Multipart form-data and JSON payloads
  - Automatically append reference image descriptions to prompt under [Reference Image Roles]
  - Support downloading image URLs for Gemini native image generation
  - Deduplicate reference images to optimize API request size
  - Add unit tests for Codex, DashScope, MiniMax, BytePlus and local/remote path resolution

* fix(tools): SSRF-guard reference-image URL downloads in create_image

  downloadImageBytes fetched caller-supplied ref_images[].url with a plain http.Client and unbounded io.ReadAll — no SSRF validation, redirect policy, or size cap, letting the gateway dial loopback/private/metadata hosts or read arbitrarily large responses.

  - Validate the URL via security.Validate and pin the resolved IP, then download through security.NewSafeClient (pinned dial, no redirects).
  - Cap the response with a bounded read (refImageMaxBytes, 20 MB).
  - Reject non-HTTP(S) reference URLs up front (file://, data:, gopher://) so provider-forwarded URLs stay HTTP(S)-only; document the trust boundary between gateway-side fetch and provider-forwarded URLs.
  - Add regression tests: blocked loopback/private/metadata, unfollowed redirect, oversized response, and non-http(s) scheme rejection.
1 parent 389640a commit a5a853f

9 files changed

Lines changed: 1465 additions & 34 deletions

internal/providers/codex_native_image.go

Lines changed: 36 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -57,28 +57,51 @@ func (p *CodexProvider) GenerateImage(ctx context.Context, req NativeImageReques
5757
// so stream is always true. Final assembly happens in parseNativeImageSSE which scans the
5858
// event stream for response.output_item.done (image item) or response.completed output walk.
5959
func (p *CodexProvider) buildNativeImageRequestBody(model string, req NativeImageRequest) map[string]any {
60+
tool := map[string]any{
61+
"type": "image_generation",
62+
"action": "generate",
63+
"model": req.ImageModel,
64+
"output_format": req.OutputFormat,
65+
"size": SizeFromAspect(req.AspectRatio),
66+
}
67+
68+
contentParts := []map[string]any{}
69+
70+
for _, img := range req.RefImages {
71+
if img.Base64 != "" {
72+
refMime := img.MimeType
73+
if refMime == "" {
74+
refMime = "image/png"
75+
}
76+
contentParts = append(contentParts, map[string]any{
77+
"type": "input_image",
78+
"image_url": fmt.Sprintf("data:%s;base64,%s", refMime, img.Base64),
79+
})
80+
} else if img.URL != "" {
81+
contentParts = append(contentParts, map[string]any{
82+
"type": "input_image",
83+
"image_url": img.URL,
84+
})
85+
}
86+
}
87+
88+
contentParts = append(contentParts, map[string]any{
89+
"type": "input_text",
90+
"text": req.Prompt,
91+
})
92+
6093
return map[string]any{
6194
"model": model,
6295
"stream": true,
6396
"store": false,
6497
"instructions": "Generate an image matching the user's description using the image_generation tool. Return only the image; do not describe it in text.",
6598
"input": []any{
6699
map[string]any{
67-
"role": "user",
68-
"content": []map[string]any{
69-
{"type": "input_text", "text": req.Prompt},
70-
},
71-
},
72-
},
73-
"tools": []map[string]any{
74-
{
75-
"type": "image_generation",
76-
"action": "generate",
77-
"model": req.ImageModel,
78-
"output_format": req.OutputFormat,
79-
"size": SizeFromAspect(req.AspectRatio),
100+
"role": "user",
101+
"content": contentParts,
80102
},
81103
},
104+
"tools": []map[string]any{tool},
82105
"tool_choice": map[string]any{
83106
"type": "image_generation",
84107
},

internal/providers/codex_native_image_test.go

Lines changed: 137 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -344,3 +344,140 @@ func TestSizeFromAspect(t *testing.T) {
344344
}
345345
}
346346
}
347+
348+
// TestCodexGenerateImage_WithReferenceImage verifies that passing a single RefImage
349+
// embeds the image in the input content and does not populate input_reference in tools[0].
350+
func TestCodexGenerateImage_WithReferenceImage(t *testing.T) {
351+
var captured []byte
352+
server := mockImageServer(t, &captured)
353+
defer server.Close()
354+
355+
p := NewCodexProvider("codex-test", &staticTokenSource{token: "tok"}, server.URL, "gpt-image-2")
356+
p.retryConfig.Attempts = 1
357+
358+
req := NativeImageRequest{
359+
Model: "gpt-image-2",
360+
Prompt: "A red circle",
361+
RefImages: []RefImage{
362+
{
363+
URL: "https://example.com/ref.png",
364+
},
365+
},
366+
AspectRatio: "1:1",
367+
OutputFormat: "png",
368+
}
369+
370+
_, err := p.GenerateImage(context.Background(), req)
371+
if err != nil {
372+
t.Fatalf("GenerateImage returned error: %v", err)
373+
}
374+
375+
var body map[string]any
376+
if err := json.Unmarshal(captured, &body); err != nil {
377+
t.Fatalf("unmarshal captured body: %v", err)
378+
}
379+
380+
// 1. Verify input contains input_image and input_text in user content
381+
inputs, ok := body["input"].([]any)
382+
if !ok || len(inputs) != 1 {
383+
t.Fatalf("body[input]: expected []any length 1, got %T len %d", body["input"], len(inputs))
384+
}
385+
userMsg, ok := inputs[0].(map[string]any)
386+
if !ok {
387+
t.Fatalf("inputs[0] is not a map")
388+
}
389+
contents, ok := userMsg["content"].([]any)
390+
if !ok || len(contents) != 2 {
391+
t.Fatalf("content: expected []any length 2, got %T len %d", userMsg["content"], len(contents))
392+
}
393+
394+
imgPart, ok := contents[0].(map[string]any)
395+
if !ok {
396+
t.Fatalf("contents[0] is not a map")
397+
}
398+
if imgPart["type"] != "input_image" || imgPart["image_url"] != "https://example.com/ref.png" {
399+
t.Errorf("expected input_image with url, got: %v", imgPart)
400+
}
401+
402+
textPart, ok := contents[1].(map[string]any)
403+
if !ok {
404+
t.Fatalf("contents[1] is not a map")
405+
}
406+
if textPart["type"] != "input_text" || textPart["text"] != "A red circle" {
407+
t.Errorf("expected input_text with prompt, got: %v", textPart)
408+
}
409+
410+
// 2. Verify tools[0] does not contain input_reference
411+
tools, ok := body["tools"].([]any)
412+
if !ok || len(tools) != 1 {
413+
t.Fatalf("tools shape invalid")
414+
}
415+
tool, ok := tools[0].(map[string]any)
416+
if !ok {
417+
t.Fatalf("tools[0] not map")
418+
}
419+
if _, has := tool["input_reference"]; has {
420+
t.Error("image_generation tool must not contain 'input_reference' field")
421+
}
422+
}
423+
424+
// TestCodexGenerateImage_WithMultipleReferenceImages verifies that passing multiple RefImages
425+
// embeds all of them in the input content in the correct order.
426+
func TestCodexGenerateImage_WithMultipleReferenceImages(t *testing.T) {
427+
var captured []byte
428+
server := mockImageServer(t, &captured)
429+
defer server.Close()
430+
431+
p := NewCodexProvider("codex-test", &staticTokenSource{token: "tok"}, server.URL, "gpt-image-2")
432+
p.retryConfig.Attempts = 1
433+
434+
req := NativeImageRequest{
435+
Model: "gpt-image-2",
436+
Prompt: "A red circle",
437+
RefImages: []RefImage{
438+
{
439+
URL: "https://example.com/ref1.png",
440+
},
441+
{
442+
Base64: "b64data",
443+
MimeType: "image/jpeg",
444+
},
445+
},
446+
AspectRatio: "1:1",
447+
OutputFormat: "png",
448+
}
449+
450+
_, err := p.GenerateImage(context.Background(), req)
451+
if err != nil {
452+
t.Fatalf("GenerateImage returned error: %v", err)
453+
}
454+
455+
var body map[string]any
456+
if err := json.Unmarshal(captured, &body); err != nil {
457+
t.Fatalf("unmarshal captured body: %v", err)
458+
}
459+
460+
inputs, _ := body["input"].([]any)
461+
userMsg, _ := inputs[0].(map[string]any)
462+
contents, _ := userMsg["content"].([]any)
463+
464+
// Expected: 2 images + 1 text prompt = 3 content parts
465+
if len(contents) != 3 {
466+
t.Fatalf("content: expected []any length 3, got len %d", len(contents))
467+
}
468+
469+
imgPart1, _ := contents[0].(map[string]any)
470+
if imgPart1["type"] != "input_image" || imgPart1["image_url"] != "https://example.com/ref1.png" {
471+
t.Errorf("expected first input_image with url, got: %v", imgPart1)
472+
}
473+
474+
imgPart2, _ := contents[1].(map[string]any)
475+
if imgPart2["type"] != "input_image" || imgPart2["image_url"] != "data:image/jpeg;base64,b64data" {
476+
t.Errorf("expected second input_image with base64 data url, got: %v", imgPart2)
477+
}
478+
479+
textPart, _ := contents[2].(map[string]any)
480+
if textPart["type"] != "input_text" || textPart["text"] != "A red circle" {
481+
t.Errorf("expected input_text with prompt, got: %v", textPart)
482+
}
483+
}

internal/providers/native_image.go

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,18 @@ type NativeImageRequest struct {
6161

6262
// OutputFormat is the desired image format: "png" (default), "jpg", "webp".
6363
OutputFormat string
64+
65+
// RefImages contains the list of reference images.
66+
RefImages []RefImage
67+
}
68+
69+
// RefImage represents a single reference image for image-to-image or styling tasks.
70+
type RefImage struct {
71+
Data []byte
72+
Base64 string
73+
MimeType string
74+
URL string
75+
Strength float64
6476
}
6577

6678
// NativeImageResult holds the result of a native image generation call.

0 commit comments

Comments
 (0)