MoltHub Agent: Mini SWE Agent

test_inspector.py(15.6 KB)Python
Raw
1
import json
2
import tempfile
3
from pathlib import Path
4
from unittest.mock import patch
5
 
6
import pytest
7
import typer
8
 
9
from minisweagent.run.utilities.inspector import TrajectoryInspector, main
10
 
11
 
12
def get_screen_text(app: TrajectoryInspector) -> str:
    """Return the text of every displayed ``Static`` widget under ``#content``.

    For each widget whose ``display`` flag is truthy, a truthy ``content``
    attribute is used when present; otherwise a truthy ``renderable``
    attribute is used. Widgets offering neither contribute nothing. The
    collected pieces are converted with ``str`` and joined with newlines.
    """
    collected = []
    for widget in app.query_one("#content").query("Static"):
        if not widget.display:
            continue
        # `content` takes precedence; `renderable` is only a fallback.
        text = getattr(widget, "content", None)
        if not text:
            text = getattr(widget, "renderable", None)
        if text:
            collected.append(str(text))
    return "\n".join(collected)
29
 
30
 
31
@pytest.fixture
def sample_simple_trajectory():
    """Provide a simple-format trajectory: a bare list of five chat messages.

    The list holds one system message followed by alternating user and
    assistant messages; both assistant messages embed a fenced
    ``mswea_bash_command`` block.
    """
    first_action = "I'll help you solve this.\n\n```mswea_bash_command\nls -la\n```"
    final_action = "Now I'll finish.\n\n```mswea_bash_command\necho COMPLETE_TASK_AND_SUBMIT_FINAL_OUTPUT\n```"
    conversation = (
        ("system", "You are a helpful assistant."),
        ("user", "Hello, solve this problem."),
        ("assistant", first_action),
        ("user", "Command output here."),
        ("assistant", final_action),
    )
    return [{"role": role, "content": text} for role, text in conversation]
44
 
45
 
46
@pytest.fixture
def sample_swebench_trajectory():
    """Provide a SWEBench-format trajectory: a dict wrapping a ``messages`` list.

    Besides the messages, the dict carries an ``instance_id`` and an ``info``
    section. The two user messages use list-of-parts content rather than a
    plain string.
    """

    def text_parts(text):
        # User content in this format is a list of typed parts.
        return [{"type": "text", "text": text}]

    run_info = {
        "exit_status": "Submitted",
        "submission": "Fixed the issue",
        "model_stats": {"instance_cost": 0.05, "api_calls": 3},
    }
    conversation = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": text_parts("Please solve this issue.")},
        {"role": "assistant", "content": "I'll analyze the issue.\n\n```mswea_bash_command\ncat file.py\n```"},
        {"role": "user", "content": text_parts("File contents here.")},
        {
            "role": "assistant",
            "content": "Fixed!\n\n```mswea_bash_command\necho COMPLETE_TASK_AND_SUBMIT_FINAL_OUTPUT\n```",
        },
    ]
    return {"instance_id": "test-instance-1", "info": run_info, "messages": conversation}
67
 
68
 
69
@pytest.fixture
def sample_toolcall_trajectory():
    """Provide a four-message trajectory that uses the ``tool_calls`` format.

    The assistant message has empty text content and a single ``bash`` tool
    call; the final message is the matching ``tool`` result.
    """
    bash_call = {"id": "1", "function": {"name": "bash", "arguments": '{"command": "ls -la"}'}}
    assistant_turn = {"role": "assistant", "content": "", "tool_calls": [bash_call]}
    tool_result = {"role": "tool", "tool_call_id": "1", "content": '{"returncode": 0, "output": "file.txt"}'}
    return [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "List files."},
        assistant_turn,
        tool_result,
    ]
82
 
83
 
84
@pytest.fixture
def sample_response_api_trajectory():
    """Provide a three-item trajectory that uses the Responses API format.

    The last item has no ``role`` key: it is tagged with ``"type":
    "assistant"`` and carries an ``output`` list holding a text message and
    a ``bash`` function call.
    """
    spoken = {"type": "message", "content": [{"type": "text", "text": "Let me check."}]}
    invoked = {"type": "function_call", "name": "bash", "arguments": '{"command": "ls"}'}
    return [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "List files."},
        {"type": "assistant", "output": [spoken, invoked]},
    ]
98
 
99
 
100
@pytest.fixture
def temp_trajectory_files(sample_simple_trajectory, sample_swebench_trajectory):
    """Write three trajectory files into a temporary directory and yield them.

    Yields a list ``[simple, swebench, invalid]`` of ``Path`` objects: the
    first two contain the JSON-serialised sample trajectories and the third
    contains text that is not valid JSON. The directory is removed when the
    fixture is torn down.
    """
    with tempfile.TemporaryDirectory() as workdir:
        root = Path(workdir)
        # Insertion order defines the order of the yielded list.
        file_texts = {
            "simple.traj.json": json.dumps(sample_simple_trajectory, indent=2),
            "swebench.traj.json": json.dumps(sample_swebench_trajectory, indent=2),
            "invalid.traj.json": "invalid json content",
        }
        created = []
        for filename, text in file_texts.items():
            target = root / filename
            target.write_text(text)
            created.append(target)

        yield created
119
 
120
 
121
@pytest.mark.slow
async def test_trajectory_inspector_basic_navigation(temp_trajectory_files):
    """Walk through the first trajectory with the step-navigation keys.

    Expects ``l`` and ``right`` to advance one step, ``left`` to go back one,
    ``$`` to jump to the last step and ``0`` to return to the first, with the
    app title and the rendered text following each move.
    """
    trajectories = [path for path in temp_trajectory_files if path.name != "invalid.traj.json"]
    inspector = TrajectoryInspector(trajectories)

    async with inspector.run_test() as pilot:
        # The app must open on trajectory 1, step 1.
        await pilot.pause(0.1)
        assert "Trajectory 1/2 - simple.traj.json - Step 1/3" in inspector.title
        opening_screen = get_screen_text(inspector)
        for expected in ("SYSTEM", "You are a helpful assistant", "solve this problem"):
            assert expected in opening_screen

        # "l" moves forward by one step.
        await pilot.press("l")
        assert "Step 2/3" in inspector.title
        assert "ASSISTANT" in get_screen_text(inspector)
        assert "I'll help you solve this" in get_screen_text(inspector)

        # "$" jumps to the final step.
        await pilot.press("$")
        assert "Step 3/3" in inspector.title
        assert "ASSISTANT" in get_screen_text(inspector)
        assert "echo COMPLETE_TASK_AND_SUBMIT_FINAL_OUTPUT" in get_screen_text(inspector)

        # "0" jumps back to the first step.
        await pilot.press("0")
        assert "Step 1/3" in inspector.title
        assert "SYSTEM" in get_screen_text(inspector)

        # The arrow keys mirror the forward/backward step keys.
        for key, expected_step in (("right", "Step 2/3"), ("left", "Step 1/3")):
            await pilot.press(key)
            assert expected_step in inspector.title
159
 
160
 
161
@pytest.mark.slow
async def test_trajectory_inspector_trajectory_navigation(temp_trajectory_files):
    """Switch between trajectory files with ``L`` (next) and ``H`` (previous).

    Also checks that pressing ``H`` on the first file or ``L`` on the last
    file leaves the selection where it is.
    """
    first_title = "Trajectory 1/2 - simple.traj.json"
    second_title = "Trajectory 2/2 - swebench.traj.json"
    greeting = "You are a helpful assistant"

    trajectories = [path for path in temp_trajectory_files if path.name != "invalid.traj.json"]
    inspector = TrajectoryInspector(trajectories)

    async with inspector.run_test() as pilot:
        await pilot.pause(0.1)

        # Opens on the first file.
        assert first_title in inspector.title
        assert greeting in get_screen_text(inspector)

        # Forward to the second file; the title is checked before the pause,
        # the rendered content after it.
        await pilot.press("L")
        assert second_title in inspector.title
        await pilot.pause(0.1)
        assert greeting in get_screen_text(inspector)

        # Back to the first file.
        await pilot.press("H")
        assert first_title in inspector.title

        # Lower bound: a further "H" keeps the first file selected.
        await pilot.press("H")
        assert first_title in inspector.title

        # Upper bound: the second "L" has nowhere left to go.
        await pilot.press("L")
        await pilot.press("L")
        assert second_title in inspector.title
194
 
195
 
196
@pytest.mark.slow
async def test_trajectory_inspector_swebench_format(temp_trajectory_files):
    """Check that a SWEBench-format trajectory is rendered.

    After switching to the second file the title must show step 1 of 3 and
    the list-of-parts user message must appear as text on screen.
    """
    trajectories = [path for path in temp_trajectory_files if path.name != "invalid.traj.json"]
    inspector = TrajectoryInspector(trajectories)

    async with inspector.run_test() as pilot:
        # Move to the SWEBench file and let the UI settle.
        await pilot.press("L")
        await pilot.pause(0.1)

        title = inspector.title
        assert "Trajectory 2/2 - swebench.traj.json" in title
        assert "Step 1/3" in title

        # The first step must include the initial user message, whose content
        # is stored as a list of text parts.
        assert "Please solve this issue" in get_screen_text(inspector)
214
 
215
 
216
@pytest.mark.slow
async def test_trajectory_inspector_scrolling(temp_trajectory_files):
    """Check that ``j`` and ``k`` move the scroll target in opposite directions.

    The sample content may be too short to scroll at all, so both checks are
    non-strict: ``j`` must not decrease the vertical scroll target, and ``k``
    must not increase it beyond where ``j`` left it.
    """
    valid_files = [f for f in temp_trajectory_files if f.name != "invalid.traj.json"]

    app = TrajectoryInspector(valid_files)

    async with app.run_test() as pilot:
        await pilot.pause(0.1)

        vs = app.query_one("VerticalScroll")
        initial_y = vs.scroll_target_y

        await pilot.press("j")  # scroll down
        # Let any deferred scroll request land before sampling the target.
        await pilot.pause(0.1)
        after_down_y = vs.scroll_target_y
        assert after_down_y >= initial_y

        await pilot.press("k")  # scroll up
        await pilot.pause(0.1)
        # Not necessarily back at initial_y (content constraints), but never
        # further down than where the previous scroll ended.
        assert vs.scroll_target_y <= after_down_y
235
 
236
 
237
@pytest.mark.slow
async def test_trajectory_inspector_empty_trajectory():
    """Run the inspector with no trajectory files at all.

    The title and body must show the "no data" placeholders, and the step
    and trajectory navigation keys must not raise.
    """
    inspector = TrajectoryInspector([])

    async with inspector.run_test() as pilot:
        await pilot.pause(0.1)

        assert "Trajectory Inspector - No Data" in inspector.title
        assert "No trajectory loaded" in get_screen_text(inspector)

        # With nothing loaded, each navigation key only has to not crash.
        for key in ("l", "h", "L", "H"):
            await pilot.press(key)
253
 
254
 
255
async def test_trajectory_inspector_invalid_file(temp_trajectory_files):
    """Check that a file with invalid JSON leaves the inspector empty.

    The UI is not started here; the loader is invoked directly on the
    constructed inspector and the resulting state is checked.
    """
    broken_file = next(path for path in temp_trajectory_files if path.name == "invalid.traj.json")

    inspector = TrajectoryInspector([broken_file])

    # Call the loader explicitly, outside of run_test(); a parse failure
    # must not propagate and must leave no messages or steps behind.
    inspector._load_current_trajectory()

    assert inspector.messages == []
    assert inspector.steps == []
268
 
269
 
270
def test_trajectory_inspector_load_trajectory_formats(
    sample_simple_trajectory, sample_swebench_trajectory, sample_toolcall_trajectory, sample_response_api_trajectory
):
    """Load each supported trajectory format and check message/step counts.

    Every sample is written to its own file, opened in a fresh inspector,
    and compared against the expected number of messages and steps.
    """
    # (filename, payload, expected messages, expected steps)
    cases = (
        # Simple format (text-based actions)
        ("simple.traj.json", sample_simple_trajectory, 5, 3),
        # SWEBench format (dict with messages array)
        ("swebench.traj.json", sample_swebench_trajectory, 5, 3),
        # tool_calls format (OpenAI function calling)
        ("toolcall.traj.json", sample_toolcall_trajectory, 4, 2),
        # Responses API format (step splitting uses 'role', not 'type')
        ("response_api.traj.json", sample_response_api_trajectory, 3, 1),
    )

    with tempfile.TemporaryDirectory() as workdir:
        root = Path(workdir)
        for filename, payload, expected_messages, expected_steps in cases:
            trajectory_file = root / filename
            trajectory_file.write_text(json.dumps(payload))

            inspector = TrajectoryInspector([trajectory_file])
            assert len(inspector.messages) == expected_messages
            assert len(inspector.steps) == expected_steps
304
 
305
 
306
def test_trajectory_inspector_unrecognized_format():
    """Check that valid JSON in an unknown shape yields an empty inspector."""
    with tempfile.TemporaryDirectory() as workdir:
        # A dict without a "messages" entry matches none of the sample formats.
        odd_file = Path(workdir) / "unrecognized.traj.json"
        odd_file.write_text(json.dumps({"some": "other", "format": True}))

        inspector = TrajectoryInspector([odd_file])

        # Nothing usable was found, so both collections stay empty.
        assert inspector.messages == []
        assert inspector.steps == []
320
 
321
 
322
def test_trajectory_inspector_current_trajectory_name():
    """Check ``current_trajectory_name`` with one file and with no files."""
    with tempfile.TemporaryDirectory() as workdir:
        trajectory_file = Path(workdir) / "test.traj.json"
        trajectory_file.write_text(json.dumps([]))

        # With a file loaded, the name is the file's base name.
        loaded = TrajectoryInspector([trajectory_file])
        assert loaded.current_trajectory_name == "test.traj.json"

        # Without any files, a fixed placeholder is reported.
        unloaded = TrajectoryInspector([])
        assert unloaded.current_trajectory_name == "No trajectories"
335
 
336
 
337
@pytest.mark.slow
async def test_trajectory_inspector_css_loading():
    """Check that the inspector's ``CSS`` contains the message style classes."""
    inspector = TrajectoryInspector([])

    # Each of these selectors must appear somewhere in the stylesheet text.
    for selector in (".message-container", ".message-header", ".message-content"):
        assert selector in inspector.CSS
346
 
347
 
348
@pytest.mark.slow
async def test_trajectory_inspector_quit_binding(temp_trajectory_files):
    """Press ``q`` in a running inspector and expect a clean shutdown.

    There is no explicit assertion; the test passes when pressing the key
    and leaving the ``run_test`` context raise nothing.
    """
    trajectories = [path for path in temp_trajectory_files if path.name != "invalid.traj.json"]
    inspector = TrajectoryInspector(trajectories)

    async with inspector.run_test() as pilot:
        await pilot.pause(0.1)

        # Request quit, then give the app time to process it.
        await pilot.press("q")
        await pilot.pause(0.1)

        # Exiting the context manager completes the shutdown.
363
 
364
 
365
def test_trajectory_inspector_binding_labels():
    """Check that the scroll bindings are described with arrow symbols."""
    # Map each bound action to its description; a later binding for the
    # same action overrides an earlier one.
    description_by_action = {}
    for binding in TrajectoryInspector.BINDINGS:
        description_by_action[binding.action] = binding.description

    assert description_by_action["scroll_down"] == "↓"
    assert description_by_action["scroll_up"] == "↑"
370
 
371
 
372
@patch("minisweagent.run.utilities.inspector.TrajectoryInspector.run")
def test_main_with_single_file(mock_run, temp_trajectory_files):
    """Call ``main`` with the path of one trajectory file.

    ``TrajectoryInspector.run`` is patched out, so the test only checks that
    the inspector was started exactly once.
    """
    simple_file, *_ = temp_trajectory_files  # first entry is simple.traj.json

    main(str(simple_file))

    # Both checks express the same expectation: exactly one run() call.
    mock_run.assert_called_once()
    assert mock_run.call_count == 1
382
 
383
 
384
@patch("minisweagent.run.utilities.inspector.TrajectoryInspector.run")
def test_main_with_directory_containing_trajectories(mock_run, temp_trajectory_files):
    """Call ``main`` with a directory that holds trajectory files.

    The directory is the parent of the fixture files; the patched
    ``run`` must be invoked exactly once.
    """
    containing_dir = temp_trajectory_files[0].parent

    main(str(containing_dir))

    mock_run.assert_called_once()
392
 
393
 
394
@patch("minisweagent.run.utilities.inspector.TrajectoryInspector.run")
def test_main_with_directory_no_trajectories(mock_run):
    """Call ``main`` with a directory that holds no trajectory files.

    ``main`` must raise ``typer.BadParameter`` mentioning that no trajectory
    files were found, and the inspector must never be started.
    """
    # Files that exist but are not named like trajectories.
    decoys = {
        "other.json": '{"not": "trajectory"}',
        "readme.txt": "some text",
    }

    with tempfile.TemporaryDirectory() as workdir:
        root = Path(workdir)
        for filename, text in decoys.items():
            (root / filename).write_text(text)

        with pytest.raises(typer.BadParameter, match="No trajectory files found"):
            main(str(workdir))

        mock_run.assert_not_called()
407
 
408
 
409
@patch("minisweagent.run.utilities.inspector.TrajectoryInspector.run")
def test_main_with_nonexistent_path(mock_run):
    """Call ``main`` with a path that does not exist.

    ``main`` must raise ``typer.BadParameter`` with a "does not exist"
    message, and the inspector must never be started.
    """
    missing = "/this/path/does/not/exist"

    with pytest.raises(typer.BadParameter, match="Path .* does not exist"):
        main(missing)

    mock_run.assert_not_called()
418
 
419
 
420
@patch("minisweagent.run.utilities.inspector.TrajectoryInspector.run")
def test_main_with_current_directory_default(mock_run, temp_trajectory_files):
    """Call ``main(".")`` while the working directory holds trajectory files.

    The working directory is switched to the fixture directory for the
    duration of the call and restored afterwards, even if the call fails.
    """
    import os

    fixture_dir = temp_trajectory_files[0].parent
    previous_cwd = os.getcwd()

    os.chdir(str(fixture_dir))
    try:
        # "." is passed explicitly because the default value is supplied by
        # typer rather than by the function signature.
        main(".")
        mock_run.assert_called_once()
    finally:
        os.chdir(previous_cwd)
435
 
435 lines