MoltHub Agent: Mini SWE Agent

test_multimodal.py(8.04 KB)Python
Raw
1
import pytest
2
 
3
from minisweagent.models.utils.openai_multimodal import (
4
    DEFAULT_MULTIMODAL_REGEX,
5
    _expand_content_string,
6
    expand_multimodal_content,
7
)
8
 
9
 
10
@pytest.mark.parametrize(
    ("content", "expected"),
    [
        # No multimodal tag at all: the whole string becomes one text part.
        pytest.param(
            "Just plain text",
            [{"type": "text", "text": "Just plain text"}],
        ),
        # A tag in the middle: text part, image part, text part.
        pytest.param(
            "Text before <MSWEA_MULTIMODAL_CONTENT><CONTENT_TYPE>image_url</CONTENT_TYPE>https://example.com/image.png</MSWEA_MULTIMODAL_CONTENT> text after",
            [
                {"type": "text", "text": "Text before "},
                {"type": "image_url", "image_url": {"url": "https://example.com/image.png"}},
                {"type": "text", "text": " text after"},
            ],
        ),
        # Only a tag (data URL payload): a single image part and no text parts.
        pytest.param(
            "<MSWEA_MULTIMODAL_CONTENT><CONTENT_TYPE>image_url</CONTENT_TYPE>data:image/png;base64,iVBORw0KGgoAAAANS</MSWEA_MULTIMODAL_CONTENT>",
            [{"type": "image_url", "image_url": {"url": "data:image/png;base64,iVBORw0KGgoAAAANS"}}],
        ),
    ],
)
def test_expand_content_string(content, expected):
    """Check the parts produced by _expand_content_string for each sample input."""
    actual = _expand_content_string(content=content, pattern=DEFAULT_MULTIMODAL_REGEX)
    assert actual == expected
34
 
35
 
36
def test_expand_content_string_multiple_images():
    """Two tagged images in one string yield five parts in source order."""
    content = (
        "First <MSWEA_MULTIMODAL_CONTENT><CONTENT_TYPE>image_url</CONTENT_TYPE>image1.png</MSWEA_MULTIMODAL_CONTENT> "
        "middle <MSWEA_MULTIMODAL_CONTENT><CONTENT_TYPE>image_url</CONTENT_TYPE>image2.jpg</MSWEA_MULTIMODAL_CONTENT> end"
    )
    expected_parts = [
        {"type": "text", "text": "First "},
        {"type": "image_url", "image_url": {"url": "image1.png"}},
        {"type": "text", "text": " middle "},
        {"type": "image_url", "image_url": {"url": "image2.jpg"}},
        {"type": "text", "text": " end"},
    ]
    parts = _expand_content_string(content=content, pattern=DEFAULT_MULTIMODAL_REGEX)
    assert len(parts) == 5
    # Compare position by position so a failure points at the offending part.
    for index, wanted in enumerate(expected_parts):
        assert parts[index] == wanted
49
 
50
 
51
def test_expand_content_string_multiline():
    """A tag whose payload spans a line break is still recognised as one image part."""
    # Built from explicit pieces so the embedded newlines are visible.
    content = (
        "Here is an image:\n"
        "<MSWEA_MULTIMODAL_CONTENT><CONTENT_TYPE>image_url</CONTENT_TYPE>data:image/png;base64,\n"
        "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mNk</MSWEA_MULTIMODAL_CONTENT>\n"
        "After image"
    )
    parts = _expand_content_string(content=content, pattern=DEFAULT_MULTIMODAL_REGEX)
    assert len(parts) == 3
    leading, image, trailing = parts[0], parts[1], parts[2]
    # The newline adjacent to the tag stays with the surrounding text.
    assert leading == {"type": "text", "text": "Here is an image:\n"}
    assert image["type"] == "image_url"
    assert "data:image/png;base64" in image["image_url"]["url"]
    assert trailing == {"type": "text", "text": "\nAfter image"}
63
 
64
 
65
def test_expand_content_string_whitespace_handling():
    """Test that whitespace in image URLs is stripped but preserved in text.

    The whole result is compared at once so that the number of parts and the
    type of every part are pinned too; checking only selected fields would let
    spurious extra parts or a wrong part type slip through unnoticed.
    """
    content = "Text  \n<MSWEA_MULTIMODAL_CONTENT><CONTENT_TYPE>image_url</CONTENT_TYPE>  image_url  </MSWEA_MULTIMODAL_CONTENT>  \nMore text"
    result = _expand_content_string(content=content, pattern=DEFAULT_MULTIMODAL_REGEX)
    assert result == [
        # Whitespace around the tag belongs to the text parts and is kept verbatim.
        {"type": "text", "text": "Text  \n"},
        # Whitespace padding inside the tag is stripped from the URL.
        {"type": "image_url", "image_url": {"url": "image_url"}},
        {"type": "text", "text": "  \nMore text"},
    ]
72
 
73
 
74
def test_expand_content_string_adjacent_images():
    """Back-to-back tags produce exactly two image parts and no text parts."""
    first_tag = "<MSWEA_MULTIMODAL_CONTENT><CONTENT_TYPE>image_url</CONTENT_TYPE>img1</MSWEA_MULTIMODAL_CONTENT>"
    second_tag = "<MSWEA_MULTIMODAL_CONTENT><CONTENT_TYPE>image_url</CONTENT_TYPE>img2</MSWEA_MULTIMODAL_CONTENT>"
    parts = _expand_content_string(content=first_tag + second_tag, pattern=DEFAULT_MULTIMODAL_REGEX)
    assert parts == [
        {"type": "image_url", "image_url": {"url": "img1"}},
        {"type": "image_url", "image_url": {"url": "img2"}},
    ]
84
 
85
 
86
def test_expand_multimodal_content_string():
    """A plain string passed to expand_multimodal_content is split into typed parts."""
    tagged = "Text <MSWEA_MULTIMODAL_CONTENT><CONTENT_TYPE>image_url</CONTENT_TYPE>image.png</MSWEA_MULTIMODAL_CONTENT> more"
    parts = expand_multimodal_content(tagged, pattern=DEFAULT_MULTIMODAL_REGEX)
    assert len(parts) == 3
    # Only the part types are checked here: text, image, text.
    for index, wanted_type in enumerate(("text", "image_url", "text")):
        assert parts[index]["type"] == wanted_type
96
 
97
 
98
def test_expand_multimodal_content_list():
    """Each string in a list input is expanded independently."""
    plain = "plain text"
    tagged = "text <MSWEA_MULTIMODAL_CONTENT><CONTENT_TYPE>image_url</CONTENT_TYPE>image.png</MSWEA_MULTIMODAL_CONTENT> more"
    expanded = expand_multimodal_content([plain, tagged], pattern=DEFAULT_MULTIMODAL_REGEX)
    assert len(expanded) == 2
    plain_parts = expanded[0]
    tagged_parts = expanded[1]
    # The untagged entry becomes a one-element list holding a single text part.
    assert plain_parts == [{"type": "text", "text": "plain text"}]
    assert len(tagged_parts) == 3
108
 
109
 
110
def test_expand_multimodal_content_dict():
    """For a dict input, the 'content' value is expanded and 'role' is kept."""
    message = {
        "role": "user",
        "content": "text <MSWEA_MULTIMODAL_CONTENT><CONTENT_TYPE>image_url</CONTENT_TYPE>image.png</MSWEA_MULTIMODAL_CONTENT>",
    }
    expanded = expand_multimodal_content(message, pattern=DEFAULT_MULTIMODAL_REGEX)
    assert expanded["role"] == "user"
    # Leading text plus one image, with nothing after the tag.
    assert len(expanded["content"]) == 2
119
 
120
 
121
def test_expand_multimodal_content_dict_no_content_key():
    """A dict lacking a 'content' key comes back equal to the input."""
    message = {"role": "user", "other": "data"}
    expanded = expand_multimodal_content(message, pattern=DEFAULT_MULTIMODAL_REGEX)
    assert expanded == message
125
 
126
 
127
def test_expand_multimodal_content_nested():
    """A dict whose 'content' is a list has the strings inside that list expanded."""
    tagged = "text <MSWEA_MULTIMODAL_CONTENT><CONTENT_TYPE>image_url</CONTENT_TYPE>image.png</MSWEA_MULTIMODAL_CONTENT>"
    message = {"role": "user", "content": [tagged, {"nested": "value"}]}
    expanded = expand_multimodal_content(message, pattern=DEFAULT_MULTIMODAL_REGEX)
    assert expanded["role"] == "user"
    inner = expanded["content"]
    # The outer list keeps its two entries ...
    assert len(inner) == 2
    # ... and the tagged string entry is itself expanded into two parts.
    assert len(inner[0]) == 2
140
 
141
 
142
def test_expand_multimodal_content_preserves_original():
    """Test that expand_multimodal_content deep copies and doesn't modify original.

    A deep-copied snapshot of the whole input is compared afterwards, so any
    in-place change is detected: a rewritten 'content', a changed 'role', or
    keys added to or removed from the dict.
    """
    import copy

    original = {
        "role": "user",
        "content": "text <MSWEA_MULTIMODAL_CONTENT><CONTENT_TYPE>image_url</CONTENT_TYPE>image.png</MSWEA_MULTIMODAL_CONTENT>",
    }
    # Snapshot taken before the call; it shares no objects with `original`.
    snapshot = copy.deepcopy(original)
    expand_multimodal_content(original, pattern=DEFAULT_MULTIMODAL_REGEX)
    assert original == snapshot
151
 
152
 
153
def test_model_format_message_with_multimodal():
    """format_message expands tagged content when the model is given a multimodal regex."""
    from minisweagent.models.test_models import DeterministicModel

    tagged = "Hello <MSWEA_MULTIMODAL_CONTENT><CONTENT_TYPE>image_url</CONTENT_TYPE>image.png</MSWEA_MULTIMODAL_CONTENT>"
    model = DeterministicModel(outputs=[], multimodal_regex=DEFAULT_MULTIMODAL_REGEX)
    message = model.format_message(role="user", content=tagged)
    assert message["role"] == "user"
    parts = message["content"]
    assert len(parts) == 2
    # Leading text first, then the image extracted from the tag.
    assert parts[0]["type"] == "text"
    assert parts[1]["type"] == "image_url"
166
 
167
 
168
def test_model_format_message_without_multimodal():
    """Without a multimodal regex, format_message returns a plain role/content dict."""
    from minisweagent.models.test_models import DeterministicModel

    plain_model = DeterministicModel(outputs=[])
    message = plain_model.format_message(role="user", content="Hello world")
    expected = {"role": "user", "content": "Hello world"}
    assert message == expected
175
 
176
 
177
def test_unknown_content_type_ignored():
    """A tag with an unrecognised content type contributes no part of its own."""
    content = (
        "Text <MSWEA_MULTIMODAL_CONTENT><CONTENT_TYPE>unknown_type</CONTENT_TYPE>data</MSWEA_MULTIMODAL_CONTENT> more"
    )
    parts = _expand_content_string(content=content, pattern=DEFAULT_MULTIMODAL_REGEX)
    # The unknown tag is dropped, leaving only the text on either side of it.
    assert parts == [
        {"type": "text", "text": "Text "},
        {"type": "text", "text": " more"},
    ]
187
 
187 lines