| 1 | import json
|
| 2 | import tempfile
|
| 3 | from pathlib import Path
|
| 4 | from unittest.mock import patch
|
| 5 |
|
| 6 | import pytest
|
| 7 | import typer
|
| 8 |
|
| 9 | from minisweagent.run.utilities.inspector import TrajectoryInspector, main
|
| 10 |
|
| 11 |
|
def get_screen_text(app: TrajectoryInspector) -> str:
    """Collect the text shown by the visible ``Static`` widgets under ``#content``.

    For each widget whose ``display`` flag is truthy, the first non-empty
    attribute among ``content`` and ``renderable`` (in that order) is
    converted with ``str``. Widgets with neither contribute nothing.

    Returns:
        The collected strings joined with newlines; empty if nothing matched.
    """
    collected = []
    for widget in app.query_one("#content").query("Static"):
        if not widget.display:
            continue
        # ``content`` wins over ``renderable``; a missing or empty attribute
        # falls through to the next candidate.
        for attr_name in ("content", "renderable"):
            value = getattr(widget, attr_name, None)
            if value:
                collected.append(str(value))
                break
    return "\n".join(collected)
|
| 29 |
|
| 30 |
|
@pytest.fixture
def sample_simple_trajectory():
    """Provide a trajectory in the simple format: a bare list of messages.

    The five messages are system, user, assistant, user, assistant; both
    assistant messages embed a fenced ``mswea_bash_command`` block.
    """
    list_files_reply = "I'll help you solve this.\n\n```mswea_bash_command\nls -la\n```"
    submit_reply = "Now I'll finish.\n\n```mswea_bash_command\necho COMPLETE_TASK_AND_SUBMIT_FINAL_OUTPUT\n```"
    conversation = [
        ("system", "You are a helpful assistant."),
        ("user", "Hello, solve this problem."),
        ("assistant", list_files_reply),
        ("user", "Command output here."),
        ("assistant", submit_reply),
    ]
    return [{"role": role, "content": content} for role, content in conversation]
|
| 44 |
|
| 45 |
|
@pytest.fixture
def sample_swebench_trajectory():
    """Provide a trajectory in SWEBench format: a dict with a ``messages`` list.

    Besides the messages it carries an ``instance_id`` and an ``info`` section.
    The user messages use list-of-parts content instead of a plain string.
    """

    def text_parts(text):
        # List-style message content holding a single text part.
        return [{"type": "text", "text": text}]

    run_info = {
        "exit_status": "Submitted",
        "submission": "Fixed the issue",
        "model_stats": {"instance_cost": 0.05, "api_calls": 3},
    }
    inspect_reply = "I'll analyze the issue.\n\n```mswea_bash_command\ncat file.py\n```"
    submit_reply = "Fixed!\n\n```mswea_bash_command\necho COMPLETE_TASK_AND_SUBMIT_FINAL_OUTPUT\n```"
    conversation = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": text_parts("Please solve this issue.")},
        {"role": "assistant", "content": inspect_reply},
        {"role": "user", "content": text_parts("File contents here.")},
        {"role": "assistant", "content": submit_reply},
    ]
    return {"instance_id": "test-instance-1", "info": run_info, "messages": conversation}
|
| 67 |
|
| 68 |
|
@pytest.fixture
def sample_toolcall_trajectory():
    """Provide a trajectory whose assistant turn uses the ``tool_calls`` format.

    The assistant message has empty text content and one ``bash`` function
    call; it is followed by a ``tool`` message carrying the same call id.
    """
    bash_call = {"id": "1", "function": {"name": "bash", "arguments": '{"command": "ls -la"}'}}
    system_msg = {"role": "system", "content": "You are a helpful assistant."}
    user_msg = {"role": "user", "content": "List files."}
    assistant_msg = {"role": "assistant", "content": "", "tool_calls": [bash_call]}
    tool_msg = {"role": "tool", "tool_call_id": "1", "content": '{"returncode": 0, "output": "file.txt"}'}
    return [system_msg, user_msg, assistant_msg, tool_msg]
|
| 82 |
|
| 83 |
|
@pytest.fixture
def sample_response_api_trajectory():
    """Provide a trajectory whose last entry uses the Responses API shape.

    That entry has a ``type`` key (no ``role``) and an ``output`` list holding
    a text message item followed by a ``bash`` function call item.
    """
    spoken_item = {"type": "message", "content": [{"type": "text", "text": "Let me check."}]}
    call_item = {"type": "function_call", "name": "bash", "arguments": '{"command": "ls"}'}
    response_entry = {"type": "assistant", "output": [spoken_item, call_item]}
    return [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "List files."},
        response_entry,
    ]
|
| 98 |
|
| 99 |
|
@pytest.fixture
def temp_trajectory_files(sample_simple_trajectory, sample_swebench_trajectory):
    """Yield three trajectory files created inside a temporary directory.

    The yielded list is ordered: ``simple.traj.json``, ``swebench.traj.json``,
    ``invalid.traj.json``. The last one deliberately holds text that is not
    JSON. The directory is removed once the consuming test finishes.
    """
    # Insertion order of this mapping is the order of the yielded paths.
    file_texts = {
        "simple.traj.json": json.dumps(sample_simple_trajectory, indent=2),
        "swebench.traj.json": json.dumps(sample_swebench_trajectory, indent=2),
        "invalid.traj.json": "invalid json content",
    }
    with tempfile.TemporaryDirectory() as temp_dir:
        root = Path(temp_dir)
        created = []
        for file_name, text in file_texts.items():
            target = root / file_name
            target.write_text(text)
            created.append(target)
        yield created
|
| 119 |
|
| 120 |
|
@pytest.mark.slow
async def test_trajectory_inspector_basic_navigation(temp_trajectory_files):
    """Step through the first trajectory with l, $, 0 and the arrow keys.

    After every key press the app title must report the expected step, and
    where checked, the rendered text must contain that step's messages.
    """
    trajectories = [path for path in temp_trajectory_files if path.name != "invalid.traj.json"]
    inspector = TrajectoryInspector(trajectories)

    async with inspector.run_test() as pilot:
        # Initial state: first trajectory, first step.
        await pilot.pause(0.1)
        assert "Trajectory 1/2 - simple.traj.json - Step 1/3" in inspector.title
        rendered = get_screen_text(inspector)
        for expected in ("SYSTEM", "You are a helpful assistant", "solve this problem"):
            assert expected in rendered

        # "l" advances one step.
        await pilot.press("l")
        assert "Step 2/3" in inspector.title
        rendered = get_screen_text(inspector)
        assert "ASSISTANT" in rendered
        assert "I'll help you solve this" in rendered

        # "$" jumps to the final step.
        await pilot.press("$")
        assert "Step 3/3" in inspector.title
        rendered = get_screen_text(inspector)
        assert "ASSISTANT" in rendered
        assert "echo COMPLETE_TASK_AND_SUBMIT_FINAL_OUTPUT" in rendered

        # "0" jumps back to the first step.
        await pilot.press("0")
        assert "Step 1/3" in inspector.title
        assert "SYSTEM" in get_screen_text(inspector)

        # Arrow keys move one step forward and back.
        await pilot.press("right")
        assert "Step 2/3" in inspector.title
        await pilot.press("left")
        assert "Step 1/3" in inspector.title
|
| 159 |
|
| 160 |
|
@pytest.mark.slow
async def test_trajectory_inspector_trajectory_navigation(temp_trajectory_files):
    """Switch between trajectory files with L and H, including at both ends."""
    first_title = "Trajectory 1/2 - simple.traj.json"
    second_title = "Trajectory 2/2 - swebench.traj.json"

    trajectories = [path for path in temp_trajectory_files if path.name != "invalid.traj.json"]
    inspector = TrajectoryInspector(trajectories)

    async with inspector.run_test() as pilot:
        await pilot.pause(0.1)

        # The first trajectory is shown on start-up.
        assert first_title in inspector.title
        assert "You are a helpful assistant" in get_screen_text(inspector)

        # "L" moves to the next trajectory file.
        await pilot.press("L")
        assert second_title in inspector.title
        await pilot.pause(0.1)
        assert "You are a helpful assistant" in get_screen_text(inspector)

        # "H" moves back to the previous one.
        await pilot.press("H")
        assert first_title in inspector.title

        # A further "H" at the first trajectory must not move.
        await pilot.press("H")
        assert first_title in inspector.title

        # Two "L" presses: the second one is past the end and must not move.
        await pilot.press("L")
        await pilot.press("L")
        assert second_title in inspector.title
|
| 194 |
|
| 195 |
|
@pytest.mark.slow
async def test_trajectory_inspector_swebench_format(temp_trajectory_files):
    """Check that a SWEBench-format trajectory renders its list-style content."""
    trajectories = [path for path in temp_trajectory_files if path.name != "invalid.traj.json"]
    inspector = TrajectoryInspector(trajectories)

    async with inspector.run_test() as pilot:
        # The SWEBench file is the second trajectory.
        await pilot.press("L")
        await pilot.pause(0.1)

        title = inspector.title
        assert "Trajectory 2/2 - swebench.traj.json" in title
        assert "Step 1/3" in title

        # The first user message is stored as a list of text parts; its text
        # must still appear in the first step.
        assert "Please solve this issue" in get_screen_text(inspector)
|
| 214 |
|
| 215 |
|
@pytest.mark.slow
async def test_trajectory_inspector_scrolling(temp_trajectory_files):
    """Test that the scroll keys move the content view in the right direction.

    Only inequalities are asserted, not exact offsets, because how far the
    view can move depends on how much content is rendered.
    """
    valid_files = [f for f in temp_trajectory_files if f.name != "invalid.traj.json"]

    app = TrajectoryInspector(valid_files)

    async with app.run_test() as pilot:
        await pilot.pause(0.1)

        vs = app.query_one("VerticalScroll")
        initial_y = vs.scroll_target_y

        await pilot.press("j")  # scroll down
        after_down_y = vs.scroll_target_y
        assert after_down_y >= initial_y

        await pilot.press("k")  # scroll up
        # Scrolling up must never leave the view further down than it was
        # after the "j" press (it may stay equal if there is nothing to scroll).
        assert vs.scroll_target_y <= after_down_y
|
| 235 |
|
| 236 |
|
@pytest.mark.slow
async def test_trajectory_inspector_empty_trajectory():
    """Check the inspector's placeholder state when given no trajectory files."""
    inspector = TrajectoryInspector([])

    async with inspector.run_test() as pilot:
        await pilot.pause(0.1)

        assert "Trajectory Inspector - No Data" in inspector.title
        assert "No trajectory loaded" in get_screen_text(inspector)

        # Step and trajectory navigation keys must not crash without data.
        for key in ("l", "h", "L", "H"):
            await pilot.press(key)
|
| 253 |
|
| 254 |
|
async def test_trajectory_inspector_invalid_file(temp_trajectory_files):
    """Check that a file with invalid JSON leaves the inspector with no data."""
    broken_files = [path for path in temp_trajectory_files if path.name == "invalid.traj.json"]
    inspector = TrajectoryInspector([broken_files[0]])

    # The app is not started via run_test() here, so trigger the load
    # explicitly; it is expected to handle the unparsable file itself.
    inspector._load_current_trajectory()

    assert inspector.messages == []
    assert inspector.steps == []
|
| 268 |
|
| 269 |
|
def test_trajectory_inspector_load_trajectory_formats(
    sample_simple_trajectory, sample_swebench_trajectory, sample_toolcall_trajectory, sample_response_api_trajectory
):
    """Load each supported trajectory format and check message and step counts."""
    # (file name, payload, expected message count, expected step count)
    cases = [
        # Simple format (text-based actions)
        ("simple.traj.json", sample_simple_trajectory, 5, 3),
        # SWEBench format (dict with messages array)
        ("swebench.traj.json", sample_swebench_trajectory, 5, 3),
        # tool_calls format (OpenAI function calling)
        ("toolcall.traj.json", sample_toolcall_trajectory, 4, 2),
        # Responses API format (step splitting uses 'role', not 'type')
        ("response_api.traj.json", sample_response_api_trajectory, 3, 1),
    ]
    with tempfile.TemporaryDirectory() as temp_dir:
        root = Path(temp_dir)
        for file_name, payload, expected_messages, expected_steps in cases:
            trajectory_file = root / file_name
            trajectory_file.write_text(json.dumps(payload))
            inspector = TrajectoryInspector([trajectory_file])
            assert len(inspector.messages) == expected_messages
            assert len(inspector.steps) == expected_steps
|
| 304 |
|
| 305 |
|
def test_trajectory_inspector_unrecognized_format():
    """Check that valid JSON in an unknown shape yields no messages or steps."""
    unknown_payload = json.dumps({"some": "other", "format": True})
    with tempfile.TemporaryDirectory() as temp_dir:
        odd_file = Path(temp_dir) / "unrecognized.traj.json"
        odd_file.write_text(unknown_payload)

        inspector = TrajectoryInspector([odd_file])

        # Neither a message list nor a dict with messages: expect empty state.
        assert inspector.messages == []
        assert inspector.steps == []
|
| 320 |
|
| 321 |
|
def test_trajectory_inspector_current_trajectory_name():
    """Check ``current_trajectory_name`` with one file and with no files."""
    with tempfile.TemporaryDirectory() as temp_dir:
        only_file = Path(temp_dir) / "test.traj.json"
        only_file.write_text(json.dumps([]))

        with_file = TrajectoryInspector([only_file])
        assert with_file.current_trajectory_name == "test.traj.json"

    # Without any trajectory files a fixed placeholder name is reported.
    without_files = TrajectoryInspector([])
    assert without_files.current_trajectory_name == "No trajectories"
|
| 335 |
|
| 336 |
|
@pytest.mark.slow
async def test_trajectory_inspector_css_loading():
    """Check that the app's CSS contains the message styling selectors."""
    inspector = TrajectoryInspector([])

    for selector in (".message-container", ".message-header", ".message-content"):
        assert selector in inspector.CSS
|
| 346 |
|
| 347 |
|
@pytest.mark.slow
async def test_trajectory_inspector_quit_binding(temp_trajectory_files):
    """Check that pressing ``q`` in a running inspector raises no error."""
    trajectories = [path for path in temp_trajectory_files if path.name != "invalid.traj.json"]
    inspector = TrajectoryInspector(trajectories)

    async with inspector.run_test() as pilot:
        await pilot.pause(0.1)

        # No explicit assertion: the test passes if quitting and leaving the
        # run_test() context complete without an exception.
        await pilot.press("q")
        await pilot.pause(0.1)
|
| 363 |
|
| 364 |
|
def test_trajectory_inspector_binding_labels():
    """Check that the scroll bindings are described with arrow symbols."""
    descriptions = {}
    for binding in TrajectoryInspector.BINDINGS:
        descriptions[binding.action] = binding.description

    assert descriptions["scroll_down"] == "↓"
    assert descriptions["scroll_up"] == "↑"
|
| 370 |
|
| 371 |
|
@patch("minisweagent.run.utilities.inspector.TrajectoryInspector.run")
def test_main_with_single_file(mock_run, temp_trajectory_files):
    """Check that ``main`` starts the inspector once for a single file path."""
    simple_file = temp_trajectory_files[0]  # simple.traj.json

    main(str(simple_file))

    # ``run`` is patched, so no UI starts; only the single invocation is checked.
    mock_run.assert_called_once()
|
| 382 |
|
| 383 |
|
@patch("minisweagent.run.utilities.inspector.TrajectoryInspector.run")
def test_main_with_directory_containing_trajectories(mock_run, temp_trajectory_files):
    """Check that ``main`` starts the inspector once for a directory of trajectories."""
    trajectory_dir = temp_trajectory_files[0].parent

    main(str(trajectory_dir))

    mock_run.assert_called_once()
|
| 392 |
|
| 393 |
|
@patch("minisweagent.run.utilities.inspector.TrajectoryInspector.run")
def test_main_with_directory_no_trajectories(mock_run):
    """Check that ``main`` rejects a directory holding no trajectory files."""
    with tempfile.TemporaryDirectory() as temp_dir:
        # Populate the directory with files that are not trajectories.
        root = Path(temp_dir)
        decoys = {"other.json": '{"not": "trajectory"}', "readme.txt": "some text"}
        for file_name, text in decoys.items():
            (root / file_name).write_text(text)

        with pytest.raises(typer.BadParameter, match="No trajectory files found"):
            main(temp_dir)

        mock_run.assert_not_called()
|
| 407 |
|
| 408 |
|
@patch("minisweagent.run.utilities.inspector.TrajectoryInspector.run")
def test_main_with_nonexistent_path(mock_run):
    """Check that ``main`` rejects a path that does not exist."""
    missing = "/this/path/does/not/exist"

    with pytest.raises(typer.BadParameter, match="Path .* does not exist"):
        main(missing)

    # The inspector must never be started for a bad path.
    mock_run.assert_not_called()
|
| 418 |
|
| 419 |
|
@patch("minisweagent.run.utilities.inspector.TrajectoryInspector.run")
def test_main_with_current_directory_default(mock_run, temp_trajectory_files):
    """Check that ``main(".")`` picks up trajectories from the working directory.

    ``"."`` is passed explicitly because the default value is applied by
    typer, not by a direct call to ``main``.
    """
    import os

    trajectory_dir = temp_trajectory_files[0].parent
    previous_cwd = os.getcwd()

    # Enter the temp directory and always restore the old working directory,
    # even if ``main`` or the assertion fails.
    os.chdir(str(trajectory_dir))
    try:
        main(".")
        mock_run.assert_called_once()
    finally:
        os.chdir(previous_cwd)
|
| 435 |
|