Commit b5dc163
fix: preserve user images in native tool call results

When native tool calls are enabled, user images sent with responses were being converted to the placeholder text "(image content)" instead of being preserved and sent to the LLM. This fix:

- Preserves image blocks in tool_result content as arrays when images are present
- Only converts to a string when no images exist (for a cleaner representation)
- Maintains compatibility with the Anthropic API's tool_result format

Also adds comprehensive test coverage for image handling in both the native and XML protocols.
1 parent 18c4d1a · commit b5dc163
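For context, the Anthropic Messages API accepts a tool_result block whose content is either a plain string or an array of text and image blocks. A minimal sketch of the array form this fix now emits when the user attaches an image — the id and payload values here are illustrative, not taken from the commit:

```ts
import { Anthropic } from "@anthropic-ai/sdk"

// tool_use_id must match the id of the assistant's originating tool_use block.
// With the fix, image blocks survive here instead of becoming "(image content)".
const toolResult: Anthropic.ToolResultBlockParam = {
	type: "tool_result",
	tool_use_id: "tool_call_123", // illustrative id
	content: [
		{ type: "text", text: "I see a cat" },
		{
			type: "image",
			source: {
				type: "base64",
				media_type: "image/png",
				data: "base64ImageData", // illustrative base64 payload
			},
		},
	],
}
```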

File tree: 2 files changed, +228 −13 lines
src/core/assistant-message/__tests__/presentAssistantMessage-images.spec.ts

Lines changed: 208 additions & 0 deletions (new file)

```ts
// npx vitest src/core/assistant-message/__tests__/presentAssistantMessage-images.spec.ts

import { describe, it, expect, beforeEach, vi } from "vitest"
import { Anthropic } from "@anthropic-ai/sdk"
import { presentAssistantMessage } from "../presentAssistantMessage"
import { Task } from "../../task/Task"
import { TOOL_PROTOCOL } from "@roo-code/types"

// Mock dependencies
vi.mock("../../task/Task")
vi.mock("../../tools/validateToolUse", () => ({
	validateToolUse: vi.fn(),
}))
vi.mock("@roo-code/telemetry", () => ({
	TelemetryService: {
		instance: {
			captureToolUsage: vi.fn(),
			captureConsecutiveMistakeError: vi.fn(),
		},
	},
}))

describe("presentAssistantMessage - Image Handling in Native Tool Calls", () => {
	let mockTask: any

	beforeEach(() => {
		// Create a mock Task with minimal properties needed for testing
		mockTask = {
			taskId: "test-task-id",
			instanceId: "test-instance",
			abort: false,
			presentAssistantMessageLocked: false,
			presentAssistantMessageHasPendingUpdates: false,
			currentStreamingContentIndex: 0,
			assistantMessageContent: [],
			userMessageContent: [],
			didCompleteReadingStream: false,
			didRejectTool: false,
			didAlreadyUseTool: false,
			diffEnabled: false,
			consecutiveMistakeCount: 0,
			api: {
				getModel: () => ({ id: "test-model", info: {} }),
			},
			browserSession: {
				closeBrowser: vi.fn().mockResolvedValue(undefined),
			},
			recordToolUsage: vi.fn(),
			toolRepetitionDetector: {
				check: vi.fn().mockReturnValue({ allowExecution: true }),
			},
			providerRef: {
				deref: () => ({
					getState: vi.fn().mockResolvedValue({
						mode: "code",
						customModes: [],
					}),
				}),
			},
			say: vi.fn().mockResolvedValue(undefined),
			ask: vi.fn().mockResolvedValue({ response: "yesButtonClicked" }),
		}
	})

	it("should preserve images in tool_result for native protocol", async () => {
		// Set up a tool_use block with an ID (indicates native protocol)
		const toolCallId = "tool_call_123"
		mockTask.assistantMessageContent = [
			{
				type: "tool_use",
				id: toolCallId, // ID indicates native protocol
				name: "ask_followup_question",
				params: { question: "What do you see?" },
			},
		]

		// Create a mock askApproval that includes images in the response
		const imageBlock: Anthropic.ImageBlockParam = {
			type: "image",
			source: {
				type: "base64",
				media_type: "image/png",
				data: "base64ImageData",
			},
		}

		mockTask.ask = vi.fn().mockResolvedValue({
			response: "yesButtonClicked",
			text: "I see a cat",
			images: ["data:image/png;base64,base64ImageData"],
		})

		// Execute presentAssistantMessage
		await presentAssistantMessage(mockTask)

		// Verify that userMessageContent was populated
		expect(mockTask.userMessageContent.length).toBeGreaterThan(0)

		// Find the tool_result block
		const toolResult = mockTask.userMessageContent.find(
			(item: any) => item.type === "tool_result" && item.tool_use_id === toolCallId,
		)

		expect(toolResult).toBeDefined()
		expect(toolResult.tool_use_id).toBe(toolCallId)

		// Check if content is an array (images should be preserved as array)
		// When images are present, content should be an array containing image blocks
		if (Array.isArray(toolResult.content)) {
			// Images were preserved!
			const hasImageBlock = toolResult.content.some((block: any) => block.type === "image")
			expect(hasImageBlock).toBe(true)
		} else {
			// If it's a string, images were NOT preserved (this is the bug we're fixing)
			// This test should PASS after the fix
			expect(Array.isArray(toolResult.content)).toBe(true)
		}
	})

	it("should convert to string when no images are present (native protocol)", async () => {
		// Set up a tool_use block with an ID (indicates native protocol)
		const toolCallId = "tool_call_456"
		mockTask.assistantMessageContent = [
			{
				type: "tool_use",
				id: toolCallId,
				name: "ask_followup_question",
				params: { question: "What is your name?" },
			},
		]

		// Response with text but NO images
		mockTask.ask = vi.fn().mockResolvedValue({
			response: "yesButtonClicked",
			text: "My name is Alice",
			images: undefined,
		})

		await presentAssistantMessage(mockTask)

		const toolResult = mockTask.userMessageContent.find(
			(item: any) => item.type === "tool_result" && item.tool_use_id === toolCallId,
		)

		expect(toolResult).toBeDefined()

		// When no images, content should be a string
		expect(typeof toolResult.content).toBe("string")
	})

	it("should preserve images in content array for XML protocol (existing behavior)", async () => {
		// Set up a tool_use block WITHOUT an ID (indicates XML protocol)
		mockTask.assistantMessageContent = [
			{
				type: "tool_use",
				// No ID = XML protocol
				name: "ask_followup_question",
				params: { question: "What do you see?" },
			},
		]

		mockTask.ask = vi.fn().mockResolvedValue({
			response: "yesButtonClicked",
			text: "I see a dog",
			images: ["data:image/png;base64,dogImageData"],
		})

		await presentAssistantMessage(mockTask)

		// For XML protocol, content is added as separate blocks
		// Check that both text and image blocks were added
		const hasTextBlock = mockTask.userMessageContent.some((item: any) => item.type === "text")
		const hasImageBlock = mockTask.userMessageContent.some((item: any) => item.type === "image")

		expect(hasTextBlock).toBe(true)
		// XML protocol preserves images as separate blocks in userMessageContent
		expect(hasImageBlock).toBe(true)
	})

	it("should handle empty tool result gracefully", async () => {
		const toolCallId = "tool_call_789"
		mockTask.assistantMessageContent = [
			{
				type: "tool_use",
				id: toolCallId,
				name: "attempt_completion",
				params: { result: "Task completed" },
			},
		]

		// Empty response
		mockTask.ask = vi.fn().mockResolvedValue({
			response: "yesButtonClicked",
			text: undefined,
			images: undefined,
		})

		await presentAssistantMessage(mockTask)

		const toolResult = mockTask.userMessageContent.find(
			(item: any) => item.type === "tool_result" && item.tool_use_id === toolCallId,
		)

		expect(toolResult).toBeDefined()
		// Should have fallback text
		expect(toolResult.content).toBeTruthy()
	})
})
```
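A note on the fixtures: the mocked ask responses carry images as data URLs ("data:image/png;base64,..."), while the Anthropic API expects structured base64 image blocks. That conversion happens elsewhere in the codebase; the helper below is only an illustrative sketch of that step, not the repository's actual implementation:

```ts
import { Anthropic } from "@anthropic-ai/sdk"

// Hypothetical helper: turn a "data:<media type>;base64,<data>" URL into an
// Anthropic image block, or undefined when the string is not a usable data URL.
function dataUrlToImageBlock(dataUrl: string): Anthropic.ImageBlockParam | undefined {
	const match = dataUrl.match(/^data:(image\/(?:png|jpeg|gif|webp));base64,(.+)$/)
	if (!match) {
		return undefined
	}
	return {
		type: "image",
		source: {
			type: "base64",
			media_type: match[1] as "image/png" | "image/jpeg" | "image/gif" | "image/webp",
			data: match[2],
		},
	}
}

// Example: the fixture URL from the first test maps to the block it asserts on.
const block = dataUrlToImageBlock("data:image/png;base64,base64ImageData")
// block?.source.data === "base64ImageData"
```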

src/core/assistant-message/presentAssistantMessage.ts

Lines changed: 20 additions & 13 deletions
```diff
@@ -297,22 +297,29 @@ export async function presentAssistantMessage(cline: Task) {
 	}

 	// For native protocol, add as tool_result block
-	let resultContent: string
+	// Preserve image blocks in the content array instead of converting to strings
+	let resultContent: string | Array<Anthropic.TextBlockParam | Anthropic.ImageBlockParam>
 	if (typeof content === "string") {
 		resultContent = content || "(tool did not return anything)"
 	} else {
-		// Convert array of content blocks to string for tool result
-		// Tool results in OpenAI format only support strings
-		resultContent = content
-			.map((item) => {
-				if (item.type === "text") {
-					return item.text
-				} else if (item.type === "image") {
-					return "(image content)"
-				}
-				return ""
-			})
-			.join("\n")
+		// Preserve both text and image blocks in the content array
+		// This allows images sent by the user to be included in tool results
+		const hasImages = content.some((item) => item.type === "image")
+		if (hasImages) {
+			// Keep as array to preserve image blocks
+			resultContent = content
+		} else {
+			// If no images, convert to string for simpler representation
+			resultContent =
+				content
+					.map((item) => {
+						if (item.type === "text") {
+							return item.text
+						}
+						return ""
+					})
+					.join("\n") || "(tool did not return anything)"
+		}
 	}

 	cline.userMessageContent.push({
```
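Restated outside the diff context, the patched branch reduces to a small pure function. The extraction below is just for readability and is not part of the commit:

```ts
import { Anthropic } from "@anthropic-ai/sdk"

type ResultBlock = Anthropic.TextBlockParam | Anthropic.ImageBlockParam

// Mirrors the patched logic: keep the array form whenever an image block is
// present; otherwise collapse text blocks to a newline-joined string.
function toResultContent(content: string | ResultBlock[]): string | ResultBlock[] {
	if (typeof content === "string") {
		return content || "(tool did not return anything)"
	}
	if (content.some((item) => item.type === "image")) {
		return content // preserve image blocks
	}
	return (
		content.map((item) => (item.type === "text" ? item.text : "")).join("\n") ||
		"(tool did not return anything)"
	)
}

// Text-only input collapses to a string:
toResultContent([{ type: "text", text: "My name is Alice" }]) // "My name is Alice"

// Mixed input keeps its image block intact:
toResultContent([
	{ type: "text", text: "I see a cat" },
	{ type: "image", source: { type: "base64", media_type: "image/png", data: "..." } },
]) // returns the array unchanged
```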
