| 1 | import json
|
| 2 | import re
|
| 3 | from unittest.mock import patch
|
| 4 |
|
| 5 | import pytest
|
| 6 | from pydantic import BaseModel
|
| 7 |
|
| 8 | from minisweagent import package_dir
|
| 9 | from minisweagent.models.test_models import DeterministicModel, make_output
|
| 10 | from minisweagent.run.benchmarks.swebench import (
|
| 11 | filter_instances,
|
| 12 | get_swebench_docker_image_name,
|
| 13 | main,
|
| 14 | remove_from_preds_file,
|
| 15 | update_preds_file,
|
| 16 | )
|
| 17 |
|
| 18 |
|
def _make_model_from_fixture(text_outputs: list[str], cost_per_call: float = 1.0, **kwargs) -> DeterministicModel:
    """Build a DeterministicModel whose canned outputs come from raw trajectory fixture text."""

    def _extract_commands(raw: str) -> list[dict]:
        # Pull the shell command out of the ```mswea_bash_command fenced block, if present.
        found = re.search(r"```mswea_bash_command\s*\n(.*?)\n```", raw, re.DOTALL)
        if found is None:
            return []
        return [{"command": found.group(1)}]

    canned = [make_output(raw, _extract_commands(raw), cost=cost_per_call) for raw in text_outputs]
    return DeterministicModel(outputs=canned, cost_per_call=cost_per_call, **kwargs)
|
| 31 |
|
| 32 |
|
@pytest.mark.slow
@pytest.mark.parametrize("workers", [1, 2])
def test_swebench_end_to_end(github_test_data, tmp_path, workers):
    """Run the full SWEBench flow on the _test subset with a deterministic model."""
    responses = github_test_data["model_responses"]

    with patch("minisweagent.run.benchmarks.swebench.get_model") as mock_get_model:
        # side_effect builds a fresh model per worker so model state is not shared.
        mock_get_model.side_effect = lambda **kwargs: _make_model_from_fixture(responses, cost_per_call=0.1)
        main(
            subset="_test",
            split="test",
            slice_spec="0:1",
            output=str(tmp_path),
            workers=workers,
            filter_spec="swe-agent__test-repo-1",
            config_spec=[str(package_dir / "config" / "benchmarks" / "swebench.yaml")],
            environment_class="docker",
        )

    # The expected patch is the last message of the recorded fixture trajectory.
    fixture_path = package_dir.parent.parent / "tests" / "test_data" / "github_issue.traj.json"
    final_content = json.loads(fixture_path.read_text())[-1]["content"]

    instance_id = "swe-agent__test-repo-1"
    expected = {
        instance_id: {
            "model_name_or_path": "deterministic",
            "instance_id": instance_id,
            "model_patch": final_content,
        }
    }

    actual = json.loads((tmp_path / "preds.json").read_text())
    assert actual == expected

    saved_traj = json.loads((tmp_path / instance_id / f"{instance_id}.traj.json").read_text())
    assert saved_traj["messages"][-1]["content"] == final_content
|
| 77 |
|
| 78 |
|
def test_get_image_name_with_existing_image_name():
    """A provided image_name wins over any constructed default."""
    spec = {"image_name": "custom/image:tag", "instance_id": "test__repo__1"}
    assert get_swebench_docker_image_name(spec) == "custom/image:tag"
|
| 83 |
|
| 84 |
|
def test_get_image_name_without_image_name():
    """Without image_name, the docker image is derived from the instance_id."""
    spec = {"instance_id": "swe-agent__test-repo__1"}
    assert (
        get_swebench_docker_image_name(spec)
        == "docker.io/swebench/sweb.eval.x86_64.swe-agent_1776_test-repo_1776_1:latest"
    )
|
| 90 |
|
| 91 |
|
def test_get_image_name_with_none_image_name():
    """An explicit image_name of None falls back to the derived image name."""
    spec = {"image_name": None, "instance_id": "django__django__4.0"}
    assert (
        get_swebench_docker_image_name(spec)
        == "docker.io/swebench/sweb.eval.x86_64.django_1776_django_1776_4.0:latest"
    )
|
| 97 |
|
| 98 |
|
def test_get_image_name_with_complex_instance_id():
    """Every '__' in the instance_id is replaced when building the image name."""
    spec = {"instance_id": "project__sub__module__version__1.2.3"}
    expected = "docker.io/swebench/sweb.eval.x86_64.project_1776_sub_1776_module_1776_version_1776_1.2.3:latest"
    assert get_swebench_docker_image_name(spec) == expected
|
| 104 |
|
| 105 |
|
def test_filter_instances_no_filters():
    """No filter and no slice returns the input unchanged."""
    data = [{"instance_id": name} for name in ("repo1__test1", "repo2__test2", "repo3__test3")]
    assert filter_instances(data, filter_spec="", slice_spec="") == data
|
| 111 |
|
| 112 |
|
def test_filter_instances_regex_filter():
    """Only instance_ids matching the regex survive."""
    ids = ["django__test1", "flask__test2", "django__test3", "requests__test4"]
    data = [{"instance_id": name} for name in ids]
    kept = filter_instances(data, filter_spec=r"django__.*", slice_spec="")
    assert kept == [{"instance_id": "django__test1"}, {"instance_id": "django__test3"}]
|
| 124 |
|
| 125 |
|
def test_filter_instances_slice_only():
    """A start:stop slice spec selects exactly that window of instances."""
    data = [{"instance_id": f"repo{i}__test{i}"} for i in range(10)]
    assert filter_instances(data, filter_spec="", slice_spec="2:5") == data[2:5]
|
| 132 |
|
| 133 |
|
def test_filter_instances_slice_start_only():
    """A start-only slice keeps everything from that index onward."""
    data = [{"instance_id": f"repo{i}__test{i}"} for i in range(5)]
    assert filter_instances(data, filter_spec="", slice_spec="3:") == data[3:]
|
| 140 |
|
| 141 |
|
def test_filter_instances_slice_end_only():
    """An end-only slice keeps the leading prefix."""
    data = [{"instance_id": f"repo{i}__test{i}"} for i in range(5)]
    assert filter_instances(data, filter_spec="", slice_spec=":2") == data[:2]
|
| 148 |
|
| 149 |
|
def test_filter_instances_filter_and_slice():
    """The regex filter is applied first, then the slice over the filtered list."""
    names = ["django__test1", "flask__test2", "django__test3", "django__test4", "requests__test5"]
    data = [{"instance_id": name} for name in names]
    picked = filter_instances(data, filter_spec=r"django__.*", slice_spec="1:3")
    assert picked == [{"instance_id": "django__test3"}, {"instance_id": "django__test4"}]
|
| 162 |
|
| 163 |
|
def test_filter_instances_shuffle():
    """Shuffling is deterministic (same result every call) yet actually reorders."""
    data = [{"instance_id": f"repo{i:02d}__test{i}"} for i in range(10)]
    first = filter_instances(data.copy(), filter_spec="", slice_spec="", shuffle=True)
    second = filter_instances(data.copy(), filter_spec="", slice_spec="", shuffle=True)
    unshuffled = filter_instances(data.copy(), filter_spec="", slice_spec="", shuffle=False)
    # Same seed => same order on repeated calls.
    assert first == second
    # But the shuffled order must differ from the input order.
    assert first != unshuffled
|
| 174 |
|
| 175 |
|
def test_filter_instances_empty_list():
    """Every combination of filters applied to an empty list yields an empty list."""
    assert filter_instances([], filter_spec=r".*", slice_spec="0:5", shuffle=True) == []
|
| 180 |
|
| 181 |
|
def test_filter_instances_no_matches():
    """A regex that matches no instance_id yields an empty result."""
    data = [{"instance_id": "django__test1"}, {"instance_id": "flask__test2"}]
    assert filter_instances(data, filter_spec=r"nonexistent__.*", slice_spec="") == []
|
| 187 |
|
| 188 |
|
def test_update_preds_file_new_file(tmp_path):
    """update_preds_file creates the predictions file when it does not exist yet."""
    preds = tmp_path / "preds.json"
    update_preds_file(preds, "test__instance__1", "test_model", "test_result")

    assert preds.exists()
    assert json.loads(preds.read_text()) == {
        "test__instance__1": {
            "model_name_or_path": "test_model",
            "instance_id": "test__instance__1",
            "model_patch": "test_result",
        }
    }
|
| 204 |
|
| 205 |
|
def test_update_preds_file_existing_file(tmp_path):
    """update_preds_file appends a new instance without touching existing entries."""
    preds = tmp_path / "preds.json"
    old_entry = {
        "model_name_or_path": "old_model",
        "instance_id": "existing__instance",
        "model_patch": "old_result",
    }
    preds.write_text(json.dumps({"existing__instance": old_entry}))

    update_preds_file(preds, "new__instance", "new_model", "new_result")

    assert json.loads(preds.read_text()) == {
        "existing__instance": old_entry,
        "new__instance": {
            "model_name_or_path": "new_model",
            "instance_id": "new__instance",
            "model_patch": "new_result",
        },
    }
|
| 237 |
|
| 238 |
|
def test_update_preds_file_overwrite_existing(tmp_path):
    """An existing instance entry is replaced in place by update_preds_file."""
    preds = tmp_path / "preds.json"
    stale_entry = {
        "model_name_or_path": "old_model",
        "instance_id": "test__instance",
        "model_patch": "old_result",
    }
    preds.write_text(json.dumps({"test__instance": stale_entry}))

    update_preds_file(preds, "test__instance", "new_model", "new_result")

    assert json.loads(preds.read_text()) == {
        "test__instance": {
            "model_name_or_path": "new_model",
            "instance_id": "test__instance",
            "model_patch": "new_result",
        }
    }
|
| 265 |
|
| 266 |
|
def test_remove_from_preds_file_existing(tmp_path):
    """remove_from_preds_file drops exactly the requested instance, keeping the rest."""
    preds = tmp_path / "preds.json"
    entry1 = {"model_name_or_path": "model1", "instance_id": "instance1", "model_patch": "result1"}
    entry2 = {"model_name_or_path": "model2", "instance_id": "instance2", "model_patch": "result2"}
    preds.write_text(json.dumps({"instance1": entry1, "instance2": entry2}))

    remove_from_preds_file(preds, "instance1")

    assert json.loads(preds.read_text()) == {"instance2": entry2}
|
| 284 |
|
| 285 |
|
def test_remove_from_preds_file_nonexistent_instance(tmp_path):
    """Removing an unknown instance leaves the file content exactly as it was."""
    preds = tmp_path / "preds.json"
    before = {"instance1": {"model_name_or_path": "model1", "instance_id": "instance1", "model_patch": "result1"}}
    preds.write_text(json.dumps(before))

    remove_from_preds_file(preds, "nonexistent")

    assert json.loads(preds.read_text()) == before
|
| 299 |
|
| 300 |
|
def test_remove_from_preds_file_no_file(tmp_path):
    """Removing from a missing predictions file is a silent no-op."""
    preds = tmp_path / "preds.json"
    remove_from_preds_file(preds, "any_instance")
    # No file should have been created as a side effect either.
    assert not preds.exists()
|
| 310 |
|
| 311 |
|
@pytest.mark.slow
def test_redo_existing_false_skips_existing(github_test_data, tmp_path):
    """With redo_existing=False, instances already in preds.json are skipped, not rerun."""
    responses = github_test_data["model_responses"]

    # Seed preds.json so the target instance already has a result.
    preds_file = tmp_path / "preds.json"
    prior = {
        "swe-agent__test-repo-1": {
            "model_name_or_path": "previous_model",
            "instance_id": "swe-agent__test-repo-1",
            "model_patch": "previous_result",
        }
    }
    preds_file.write_text(json.dumps(prior))

    with patch("minisweagent.run.benchmarks.swebench.get_model") as mock_get_model:
        mock_get_model.side_effect = lambda **kwargs: _make_model_from_fixture(responses)
        main(
            subset="_test",
            split="test",
            slice_spec="0:1",
            output=str(tmp_path),
            workers=1,
            filter_spec="swe-agent__test-repo-1",
            redo_existing=False,
            config_spec=[str(package_dir / "config" / "benchmarks" / "swebench.yaml")],
        )

    # The pre-existing entry must survive untouched.
    assert json.loads(preds_file.read_text()) == prior
|
| 345 |
|
| 346 |
|
@pytest.mark.slow
def test_redo_existing_true_overwrites_existing(github_test_data, tmp_path):
    """With redo_existing=True, an instance is reprocessed and its old result replaced."""
    responses = github_test_data["model_responses"]

    # Seed preds.json so the target instance already has a (stale) result.
    preds_file = tmp_path / "preds.json"
    prior = {
        "swe-agent__test-repo-1": {
            "model_name_or_path": "previous_model",
            "instance_id": "swe-agent__test-repo-1",
            "model_patch": "previous_result",
        }
    }
    preds_file.write_text(json.dumps(prior))

    with patch("minisweagent.run.benchmarks.swebench.get_model") as mock_get_model:
        mock_get_model.side_effect = lambda **kwargs: _make_model_from_fixture(responses, cost_per_call=0.1)
        main(
            subset="_test",
            split="test",
            slice_spec="0:1",
            output=str(tmp_path),
            workers=1,
            filter_spec="swe-agent__test-repo-1",
            redo_existing=True,
            config_spec=[str(package_dir / "config" / "benchmarks" / "swebench.yaml")],
            environment_class="docker",
        )

    # The stale entry must be replaced by the deterministic model's output,
    # which equals the final message of the fixture trajectory.
    fixture_path = package_dir.parent.parent / "tests" / "test_data" / "github_issue.traj.json"
    expected_patch = json.loads(fixture_path.read_text())[-1]["content"]

    updated = json.loads(preds_file.read_text())["swe-agent__test-repo-1"]
    assert updated["model_patch"] == expected_patch
    assert updated["model_name_or_path"] == "deterministic"
|
| 386 |
|
| 387 |
|
class ExceptionModelConfig(BaseModel):
    """Minimal config for ExceptionModel; only carries the reported model name."""

    model_name: str = "exception_model"
|
| 390 |
|
| 391 |
|
class ExceptionModel:
    """Stub model whose query() always raises, used to exercise error-handling paths."""

    def __init__(self, exception_type: type[Exception] = RuntimeError, exception_message: str = "Test exception"):
        self.exception_type = exception_type
        self.exception_message = exception_message
        self.cost = 0.0
        self.n_calls = 0
        self.config = ExceptionModelConfig()

    def query(self, *args, **kwargs):
        # Count the attempt, then fail with the configured exception.
        self.n_calls += 1
        raise self.exception_type(self.exception_message)

    def format_message(self, **kwargs) -> dict:
        return {**kwargs}

    def format_observation_messages(
        self, message: dict, outputs: list[dict], template_vars: dict | None = None
    ) -> list[dict]:
        # One user message per output; `message` and `template_vars` are unused here.
        return [self.format_message(role="user", content=str(out)) for out in outputs]

    def get_template_vars(self, **kwargs) -> dict:
        extras = {"n_model_calls": self.n_calls, "model_cost": self.cost}
        return self.config.model_dump() | extras

    def serialize(self) -> dict:
        stats = {"instance_cost": self.cost, "api_calls": self.n_calls}
        config = {
            "model": self.config.model_dump(mode="json"),
            "model_type": f"{self.__class__.__module__}.{self.__class__.__name__}",
        }
        return {"info": {"model_stats": stats, "config": config}}
|
| 430 |
|
| 431 |
|
@pytest.mark.slow
@pytest.mark.parametrize("workers", [1, 2])
def test_exception_handling_in_agent_run(tmp_path, workers):
    """Exceptions raised during agent.run() are recorded in both preds and trajectory files."""
    instance_id = "swe-agent__test-repo-1"
    with (
        patch("minisweagent.run.benchmarks.swebench.get_model") as mock_get_model,
        patch("minisweagent.run.benchmarks.swebench.RunBatchProgressManager") as mock_progress_class,
    ):
        mock_get_model.return_value = ExceptionModel(RuntimeError, "Agent processing failed")
        mock_progress_class.return_value.render_group = None  # Live() needs a renderable attribute
        main(
            subset="_test",
            split="test",
            slice_spec="0:1",
            output=str(tmp_path),
            workers=workers,
            filter_spec="swe-agent__test-repo-1",
            config_spec=[str(package_dir / "config" / "benchmarks" / "swebench.yaml")],
            environment_class="docker",
        )

    # preds.json records the failed instance with an empty patch.
    preds_file = tmp_path / "preds.json"
    assert preds_file.exists()
    preds = json.loads(preds_file.read_text())
    assert instance_id in preds
    assert preds[instance_id]["model_patch"] == ""
    assert preds[instance_id]["model_name_or_path"] == "exception_model"

    # The trajectory file carries the exception details.
    traj_file = tmp_path / instance_id / f"{instance_id}.traj.json"
    assert traj_file.exists()
    traj = json.loads(traj_file.read_text())
    assert traj["instance_id"] == instance_id
    assert traj["info"]["exit_status"] == "RuntimeError"
    assert traj["info"]["submission"] == ""
    assert traj["info"]["exception_str"] == "Agent processing failed"
|
| 473 |
|
| 474 |
|
@pytest.mark.slow
@pytest.mark.parametrize("workers", [1, 2])
def test_different_exception_types(tmp_path, workers):
    """The concrete exception class name is what gets recorded as exit_status."""
    with (
        patch("minisweagent.run.benchmarks.swebench.get_model") as mock_get_model,
        patch("minisweagent.run.benchmarks.swebench.RunBatchProgressManager") as mock_progress_class,
    ):
        mock_get_model.return_value = ExceptionModel(ValueError, "Invalid input provided")
        mock_progress_class.return_value.render_group = None  # Live() needs a renderable attribute
        main(
            subset="_test",
            split="test",
            slice_spec="0:1",
            output=str(tmp_path),
            workers=workers,
            filter_spec="swe-agent__test-repo-1",
            config_spec=[str(package_dir / "config" / "benchmarks" / "swebench.yaml")],
            environment_class="docker",
        )

    # The trajectory file must name the raised exception type and message.
    instance_id = "swe-agent__test-repo-1"
    info = json.loads((tmp_path / instance_id / f"{instance_id}.traj.json").read_text())["info"]
    assert info["exit_status"] == "ValueError"
    assert info["submission"] == ""
    assert info["exception_str"] == "Invalid input provided"
|
| 505 |
|
| 506 |
|
@pytest.mark.slow
def test_exception_handling_with_progress_manager(tmp_path):
    """In multithreaded mode the progress manager is notified about handled exceptions."""
    with (
        patch("minisweagent.run.benchmarks.swebench.get_model") as mock_get_model,
        patch("minisweagent.run.benchmarks.swebench.RunBatchProgressManager") as mock_progress_class,
    ):
        mock_get_model.return_value = ExceptionModel(ConnectionError, "Network timeout")
        progress = mock_progress_class.return_value
        progress.render_group = None  # Live() needs a renderable attribute
        main(
            subset="_test",
            split="test",
            slice_spec="0:1",
            output=str(tmp_path),
            workers=2,  # multithreaded path is what exercises the progress manager
            filter_spec="swe-agent__test-repo-1",
            config_spec=[str(package_dir / "config" / "benchmarks" / "swebench.yaml")],
            environment_class="docker",
        )

    progress.on_instance_start.assert_called_once_with("swe-agent__test-repo-1")
    progress.on_instance_end.assert_called_once_with("swe-agent__test-repo-1", "ConnectionError")
    # A handled exception must not be reported as uncaught.
    progress.on_uncaught_exception.assert_not_called()
|
| 534 |
|