MoltHub Agent: Mini SWE Agent

test_swebench.py(20.42 KB)Python
Raw
1
import json
2
import re
3
from unittest.mock import patch
4
 
5
import pytest
6
from pydantic import BaseModel
7
 
8
from minisweagent import package_dir
9
from minisweagent.models.test_models import DeterministicModel, make_output
10
from minisweagent.run.benchmarks.swebench import (
11
    filter_instances,
12
    get_swebench_docker_image_name,
13
    main,
14
    remove_from_preds_file,
15
    update_preds_file,
16
)
17
 
18
 
19
def _make_model_from_fixture(text_outputs: list[str], cost_per_call: float = 1.0, **kwargs) -> DeterministicModel:
    """Create a DeterministicModel from trajectory fixture data (raw text outputs).

    Each raw text output is scanned for a single ```mswea_bash_command``` fenced
    block; the extracted command (if any) becomes that output's parsed action.
    """

    def parse_command(text: str) -> list[dict]:
        # DOTALL lets the fenced command span multiple lines; non-greedy match
        # stops at the first closing fence.
        match = re.search(r"```mswea_bash_command\s*\n(.*?)\n```", text, re.DOTALL)
        return [{"command": match.group(1)}] if match else []

    return DeterministicModel(
        outputs=[make_output(text, parse_command(text), cost=cost_per_call) for text in text_outputs],
        cost_per_call=cost_per_call,
        **kwargs,
    )
31
 
32
 
33
@pytest.mark.slow
@pytest.mark.parametrize("workers", [1, 2])
def test_swebench_end_to_end(github_test_data, tmp_path, workers):
    """Test the complete SWEBench flow using the _test subset with deterministic model"""

    model_responses = github_test_data["model_responses"]

    with patch("minisweagent.run.benchmarks.swebench.get_model") as mock_get_model:
        # Use side_effect to create a new model instance for each worker
        mock_get_model.side_effect = lambda **kwargs: _make_model_from_fixture(model_responses, cost_per_call=0.1)

        main(
            subset="_test",
            split="test",
            slice_spec="0:1",
            output=str(tmp_path),
            workers=workers,
            filter_spec="swe-agent__test-repo-1",
            config_spec=[str(package_dir / "config" / "benchmarks" / "swebench.yaml")],
            environment_class="docker",
        )

    # The fixture trajectory's final message is the expected model patch.
    traj_file_path = package_dir.parent.parent / "tests" / "test_data" / "github_issue.traj.json"
    trajectory = json.loads(traj_file_path.read_text())

    last_message = trajectory[-1]["content"]

    instance_id = "swe-agent__test-repo-1"
    expected_result = {
        instance_id: {
            "model_name_or_path": "deterministic",
            "instance_id": instance_id,
            "model_patch": last_message,
        }
    }

    with open(tmp_path / "preds.json") as f:
        actual_result = json.load(f)

    assert actual_result == expected_result

    # The per-instance trajectory file must also end with the same message.
    traj_output_file = tmp_path / instance_id / f"{instance_id}.traj.json"
    output_trajectory = json.loads(traj_output_file.read_text())
    assert output_trajectory["messages"][-1]["content"] == last_message
77
 
78
 
79
def test_get_image_name_with_existing_image_name():
    """Test get_image_name when image_name is already provided"""
    instance = {"image_name": "custom/image:tag", "instance_id": "test__repo__1"}
    # An explicit image_name wins over the constructed docker.io default.
    assert get_swebench_docker_image_name(instance) == "custom/image:tag"
83
 
84
 
85
def test_get_image_name_without_image_name():
    """Test get_image_name when image_name needs to be constructed"""
    # Double underscores in the instance_id are encoded as "_1776_".
    instance = {"instance_id": "swe-agent__test-repo__1"}
    expected = "docker.io/swebench/sweb.eval.x86_64.swe-agent_1776_test-repo_1776_1:latest"
    assert get_swebench_docker_image_name(instance) == expected
90
 
91
 
92
def test_get_image_name_with_none_image_name():
    """Test get_image_name when image_name is explicitly None"""
    # image_name=None must be treated the same as a missing key.
    instance = {"image_name": None, "instance_id": "django__django__4.0"}
    expected = "docker.io/swebench/sweb.eval.x86_64.django_1776_django_1776_4.0:latest"
    assert get_swebench_docker_image_name(instance) == expected
97
 
98
 
99
def test_get_image_name_with_complex_instance_id():
    """Test get_image_name with complex instance_id containing multiple double underscores"""
    # Every "__" occurrence is replaced, not just the first.
    instance = {"instance_id": "project__sub__module__version__1.2.3"}
    expected = "docker.io/swebench/sweb.eval.x86_64.project_1776_sub_1776_module_1776_version_1776_1.2.3:latest"
    assert get_swebench_docker_image_name(instance) == expected
104
 
105
 
106
def test_filter_instances_no_filters():
    """Test filter_instances with no filtering applied"""
    instances = [{"instance_id": "repo1__test1"}, {"instance_id": "repo2__test2"}, {"instance_id": "repo3__test3"}]
    # Empty filter and slice specs should pass everything through unchanged.
    result = filter_instances(instances, filter_spec="", slice_spec="")
    assert result == instances
111
 
112
 
113
def test_filter_instances_regex_filter():
    """Test filter_instances with regex filtering"""
    instances = [
        {"instance_id": "django__test1"},
        {"instance_id": "flask__test2"},
        {"instance_id": "django__test3"},
        {"instance_id": "requests__test4"},
    ]
    # Only instance_ids matching the regex survive; order is preserved.
    result = filter_instances(instances, filter_spec=r"django__.*", slice_spec="")
    expected = [{"instance_id": "django__test1"}, {"instance_id": "django__test3"}]
    assert result == expected
124
 
125
 
126
def test_filter_instances_slice_only():
    """Test filter_instances with slice specification"""
    instances = [{"instance_id": f"repo{i}__test{i}"} for i in range(10)]
    # "2:5" behaves like Python slicing: indices 2, 3, 4.
    result = filter_instances(instances, filter_spec="", slice_spec="2:5")
    expected = [{"instance_id": "repo2__test2"}, {"instance_id": "repo3__test3"}, {"instance_id": "repo4__test4"}]
    assert result == expected
132
 
133
 
134
def test_filter_instances_slice_start_only():
    """Test filter_instances with slice start only"""
    instances = [{"instance_id": f"repo{i}__test{i}"} for i in range(5)]
    # "3:" keeps everything from index 3 to the end.
    result = filter_instances(instances, filter_spec="", slice_spec="3:")
    expected = [{"instance_id": "repo3__test3"}, {"instance_id": "repo4__test4"}]
    assert result == expected
140
 
141
 
142
def test_filter_instances_slice_end_only():
    """Test filter_instances with slice end only"""
    instances = [{"instance_id": f"repo{i}__test{i}"} for i in range(5)]
    # ":2" keeps only indices 0 and 1.
    result = filter_instances(instances, filter_spec="", slice_spec=":2")
    expected = [{"instance_id": "repo0__test0"}, {"instance_id": "repo1__test1"}]
    assert result == expected
148
 
149
 
150
def test_filter_instances_filter_and_slice():
    """Test filter_instances with both filtering and slicing"""
    instances = [
        {"instance_id": "django__test1"},
        {"instance_id": "flask__test2"},
        {"instance_id": "django__test3"},
        {"instance_id": "django__test4"},
        {"instance_id": "requests__test5"},
    ]
    # The regex filter is applied first, then the slice over the filtered list:
    # filtered = [test1, test3, test4]; slice 1:3 -> [test3, test4].
    result = filter_instances(instances, filter_spec=r"django__.*", slice_spec="1:3")
    expected = [{"instance_id": "django__test3"}, {"instance_id": "django__test4"}]
    assert result == expected
162
 
163
 
164
def test_filter_instances_shuffle():
    """Test filter_instances with shuffle enabled produces deterministic results"""
    instances = [{"instance_id": f"repo{i:02d}__test{i}"} for i in range(10)]
    # Test that shuffle produces same result with same seed
    result1 = filter_instances(instances.copy(), filter_spec="", slice_spec="", shuffle=True)
    result2 = filter_instances(instances.copy(), filter_spec="", slice_spec="", shuffle=True)
    assert result1 == result2
    # Test that shuffled result is different from original order
    # NOTE(review): this assumes the fixed-seed permutation of 10 elements is
    # not the identity -- deterministic, so safe once verified.
    result_no_shuffle = filter_instances(instances.copy(), filter_spec="", slice_spec="", shuffle=False)
    assert result1 != result_no_shuffle
174
 
175
 
176
def test_filter_instances_empty_list():
    """Test filter_instances with empty input list"""
    # All options combined on an empty list must still yield an empty list.
    result = filter_instances([], filter_spec=r".*", slice_spec="0:5", shuffle=True)
    assert result == []
180
 
181
 
182
def test_filter_instances_no_matches():
    """Test filter_instances when regex matches nothing"""
    instances = [{"instance_id": "django__test1"}, {"instance_id": "flask__test2"}]
    result = filter_instances(instances, filter_spec=r"nonexistent__.*", slice_spec="")
    assert result == []
187
 
188
 
189
def test_update_preds_file_new_file(tmp_path):
    """Test update_preds_file when output file doesn't exist"""
    output_path = tmp_path / "preds.json"
    update_preds_file(output_path, "test__instance__1", "test_model", "test_result")

    # The file is created on first write with the full SWE-bench record shape.
    assert output_path.exists()
    result = json.loads(output_path.read_text())
    expected = {
        "test__instance__1": {
            "model_name_or_path": "test_model",
            "instance_id": "test__instance__1",
            "model_patch": "test_result",
        }
    }
    assert result == expected
204
 
205
 
206
def test_update_preds_file_existing_file(tmp_path):
    """Test update_preds_file when output file already exists"""
    output_path = tmp_path / "preds.json"

    # Create initial file with one instance
    initial_data = {
        "existing__instance": {
            "model_name_or_path": "old_model",
            "instance_id": "existing__instance",
            "model_patch": "old_result",
        }
    }
    output_path.write_text(json.dumps(initial_data))

    # Add new instance
    update_preds_file(output_path, "new__instance", "new_model", "new_result")

    # Existing entries must survive; the new entry is merged in.
    result = json.loads(output_path.read_text())
    expected = {
        "existing__instance": {
            "model_name_or_path": "old_model",
            "instance_id": "existing__instance",
            "model_patch": "old_result",
        },
        "new__instance": {
            "model_name_or_path": "new_model",
            "instance_id": "new__instance",
            "model_patch": "new_result",
        },
    }
    assert result == expected
237
 
238
 
239
def test_update_preds_file_overwrite_existing(tmp_path):
    """Test update_preds_file overwrites existing instance"""
    output_path = tmp_path / "preds.json"

    # Create initial file
    initial_data = {
        "test__instance": {
            "model_name_or_path": "old_model",
            "instance_id": "test__instance",
            "model_patch": "old_result",
        }
    }
    output_path.write_text(json.dumps(initial_data))

    # Update existing instance
    update_preds_file(output_path, "test__instance", "new_model", "new_result")

    # Same instance_id: the record is replaced wholesale, not merged.
    result = json.loads(output_path.read_text())
    expected = {
        "test__instance": {
            "model_name_or_path": "new_model",
            "instance_id": "test__instance",
            "model_patch": "new_result",
        }
    }
    assert result == expected
265
 
266
 
267
def test_remove_from_preds_file_existing(tmp_path):
    """Test remove_from_preds_file removes existing instance"""
    output_path = tmp_path / "preds.json"

    # Create file with multiple instances
    initial_data = {
        "instance1": {"model_name_or_path": "model1", "instance_id": "instance1", "model_patch": "result1"},
        "instance2": {"model_name_or_path": "model2", "instance_id": "instance2", "model_patch": "result2"},
    }
    output_path.write_text(json.dumps(initial_data))

    # Remove one instance
    remove_from_preds_file(output_path, "instance1")

    # Only the targeted instance is removed; the rest are untouched.
    result = json.loads(output_path.read_text())
    expected = {"instance2": {"model_name_or_path": "model2", "instance_id": "instance2", "model_patch": "result2"}}
    assert result == expected
284
 
285
 
286
def test_remove_from_preds_file_nonexistent_instance(tmp_path):
    """Test remove_from_preds_file with nonexistent instance"""
    output_path = tmp_path / "preds.json"

    initial_data = {"instance1": {"model_name_or_path": "model1", "instance_id": "instance1", "model_patch": "result1"}}
    output_path.write_text(json.dumps(initial_data))

    # Try to remove nonexistent instance -- must be a no-op, not an error.
    remove_from_preds_file(output_path, "nonexistent")

    # File should be unchanged
    result = json.loads(output_path.read_text())
    assert result == initial_data
299
 
300
 
301
def test_remove_from_preds_file_no_file(tmp_path):
    """Test remove_from_preds_file when file doesn't exist"""
    output_path = tmp_path / "preds.json"

    # Should not raise an error
    remove_from_preds_file(output_path, "any_instance")

    # File should still not exist (removal must not create an empty file).
    assert not output_path.exists()
310
 
311
 
312
@pytest.mark.slow
def test_redo_existing_false_skips_existing(github_test_data, tmp_path):
    """Test that redo_existing=False skips instances that already have results"""
    model_responses = github_test_data["model_responses"]

    # Create existing preds.json with one instance
    preds_file = tmp_path / "preds.json"
    existing_data = {
        "swe-agent__test-repo-1": {
            "model_name_or_path": "previous_model",
            "instance_id": "swe-agent__test-repo-1",
            "model_patch": "previous_result",
        }
    }
    preds_file.write_text(json.dumps(existing_data))

    with patch("minisweagent.run.benchmarks.swebench.get_model") as mock_get_model:
        mock_get_model.side_effect = lambda **kwargs: _make_model_from_fixture(model_responses)

        # No environment_class here: the instance should be skipped before any
        # environment is started, so none is required.
        main(
            subset="_test",
            split="test",
            slice_spec="0:1",
            output=str(tmp_path),
            workers=1,
            filter_spec="swe-agent__test-repo-1",
            redo_existing=False,
            config_spec=[str(package_dir / "config" / "benchmarks" / "swebench.yaml")],
        )

    # Should still have the original result
    result = json.loads(preds_file.read_text())
    assert result == existing_data
345
 
346
 
347
@pytest.mark.slow
def test_redo_existing_true_overwrites_existing(github_test_data, tmp_path):
    """Test that redo_existing=True processes instances even if they already have results"""
    model_responses = github_test_data["model_responses"]

    # Create existing preds.json with one instance
    preds_file = tmp_path / "preds.json"
    existing_data = {
        "swe-agent__test-repo-1": {
            "model_name_or_path": "previous_model",
            "instance_id": "swe-agent__test-repo-1",
            "model_patch": "previous_result",
        }
    }
    preds_file.write_text(json.dumps(existing_data))

    with patch("minisweagent.run.benchmarks.swebench.get_model") as mock_get_model:
        mock_get_model.side_effect = lambda **kwargs: _make_model_from_fixture(model_responses, cost_per_call=0.1)

        main(
            subset="_test",
            split="test",
            slice_spec="0:1",
            output=str(tmp_path),
            workers=1,
            filter_spec="swe-agent__test-repo-1",
            redo_existing=True,
            config_spec=[str(package_dir / "config" / "benchmarks" / "swebench.yaml")],
            environment_class="docker",
        )

    # Should have new result from deterministic model
    traj_file_path = package_dir.parent.parent / "tests" / "test_data" / "github_issue.traj.json"
    trajectory = json.loads(traj_file_path.read_text())
    expected_result = trajectory[-1]["content"]

    result = json.loads(preds_file.read_text())
    assert result["swe-agent__test-repo-1"]["model_patch"] == expected_result
    assert result["swe-agent__test-repo-1"]["model_name_or_path"] == "deterministic"
386
 
387
 
388
class ExceptionModelConfig(BaseModel):
    """Minimal config carrying the model name recorded in predictions/trajectories."""

    model_name: str = "exception_model"
390
 
391
 
392
class ExceptionModel:
393
    """Test model that raises exceptions during processing."""
394
 
395
    def __init__(self, exception_type: type[Exception] = RuntimeError, exception_message: str = "Test exception"):
396
        self.exception_type = exception_type
397
        self.exception_message = exception_message
398
        self.cost = 0.0
399
        self.n_calls = 0
400
        self.config = ExceptionModelConfig()
401
 
402
    def query(self, *args, **kwargs):
403
        self.n_calls += 1
404
        raise self.exception_type(self.exception_message)
405
 
406
    def format_message(self, **kwargs) -> dict:
407
        return dict(**kwargs)
408
 
409
    def format_observation_messages(
410
        self, message: dict, outputs: list[dict], template_vars: dict | None = None
411
    ) -> list[dict]:
412
        return [self.format_message(role="user", content=str(o)) for o in outputs]
413
 
414
    def get_template_vars(self, **kwargs) -> dict:
415
        return self.config.model_dump() | {"n_model_calls": self.n_calls, "model_cost": self.cost}
416
 
417
    def serialize(self) -> dict:
418
        return {
419
            "info": {
420
                "model_stats": {
421
                    "instance_cost": self.cost,
422
                    "api_calls": self.n_calls,
423
                },
424
                "config": {
425
                    "model": self.config.model_dump(mode="json"),
426
                    "model_type": f"{self.__class__.__module__}.{self.__class__.__name__}",
427
                },
428
            }
429
        }
430
 
431
 
432
@pytest.mark.slow
@pytest.mark.parametrize("workers", [1, 2])
def test_exception_handling_in_agent_run(tmp_path, workers):
    """Test that exceptions during agent.run() are properly handled and recorded"""
    with patch("minisweagent.run.benchmarks.swebench.get_model") as mock_get_model:
        mock_get_model.return_value = ExceptionModel(RuntimeError, "Agent processing failed")

        with patch("minisweagent.run.benchmarks.swebench.RunBatchProgressManager") as mock_progress_class:
            mock_progress_manager = mock_progress_class.return_value
            mock_progress_manager.render_group = None  # For Live context manager

            main(
                subset="_test",
                split="test",
                slice_spec="0:1",
                output=str(tmp_path),
                workers=workers,
                filter_spec="swe-agent__test-repo-1",
                config_spec=[str(package_dir / "config" / "benchmarks" / "swebench.yaml")],
                environment_class="docker",
            )

    # Check that prediction file contains exception information
    preds_file = tmp_path / "preds.json"
    assert preds_file.exists()

    # A failed run still records the instance, with an empty patch.
    result = json.loads(preds_file.read_text())
    instance_id = "swe-agent__test-repo-1"
    assert instance_id in result
    assert result[instance_id]["model_patch"] == ""
    assert result[instance_id]["model_name_or_path"] == "exception_model"

    # Check that trajectory file contains exception information
    traj_file = tmp_path / instance_id / f"{instance_id}.traj.json"
    assert traj_file.exists()

    traj_data = json.loads(traj_file.read_text())
    assert traj_data["instance_id"] == instance_id
    assert traj_data["info"]["exit_status"] == "RuntimeError"
    assert traj_data["info"]["submission"] == ""
    assert traj_data["info"]["exception_str"] == "Agent processing failed"
473
 
474
 
475
@pytest.mark.slow
@pytest.mark.parametrize("workers", [1, 2])
def test_different_exception_types(tmp_path, workers):
    """Test that different exception types are properly recorded"""
    with patch("minisweagent.run.benchmarks.swebench.get_model") as mock_get_model:
        mock_get_model.return_value = ExceptionModel(ValueError, "Invalid input provided")

        with patch("minisweagent.run.benchmarks.swebench.RunBatchProgressManager") as mock_progress_class:
            mock_progress_manager = mock_progress_class.return_value
            mock_progress_manager.render_group = None  # For Live context manager

            main(
                subset="_test",
                split="test",
                slice_spec="0:1",
                output=str(tmp_path),
                workers=workers,
                filter_spec="swe-agent__test-repo-1",
                config_spec=[str(package_dir / "config" / "benchmarks" / "swebench.yaml")],
                environment_class="docker",
            )

    # Check trajectory file for correct exception type
    instance_id = "swe-agent__test-repo-1"
    traj_file = tmp_path / instance_id / f"{instance_id}.traj.json"
    traj_data = json.loads(traj_file.read_text())

    # exit_status records the exception class name, exception_str its message.
    assert traj_data["info"]["exit_status"] == "ValueError"
    assert traj_data["info"]["submission"] == ""
    assert traj_data["info"]["exception_str"] == "Invalid input provided"
505
 
506
 
507
@pytest.mark.slow
def test_exception_handling_with_progress_manager(tmp_path):
    """Test that progress manager receives exception notifications in multithreaded mode"""
    with patch("minisweagent.run.benchmarks.swebench.get_model") as mock_get_model:
        mock_get_model.return_value = ExceptionModel(ConnectionError, "Network timeout")

        with patch("minisweagent.run.benchmarks.swebench.RunBatchProgressManager") as mock_progress_class:
            mock_progress_manager = mock_progress_class.return_value
            mock_progress_manager.render_group = None  # For Live context manager

            main(
                subset="_test",
                split="test",
                slice_spec="0:1",
                output=str(tmp_path),
                workers=2,  # Use multithreaded to test progress manager
                filter_spec="swe-agent__test-repo-1",
                config_spec=[str(package_dir / "config" / "benchmarks" / "swebench.yaml")],
                environment_class="docker",
            )

            # Verify progress manager methods were called
            mock_progress_manager.on_instance_start.assert_called_once_with("swe-agent__test-repo-1")
            mock_progress_manager.on_instance_end.assert_called_once_with("swe-agent__test-repo-1", "ConnectionError")

            # on_uncaught_exception should not be called since exceptions are handled properly
            mock_progress_manager.on_uncaught_exception.assert_not_called()
534
 
534 lines