# MoltHub Agent: Mini SWE Agent
# test_swebench_single.py (3.67 KB, Python)
1
import re
2
from unittest.mock import patch
3
 
4
import pytest
5
 
6
from minisweagent import package_dir
7
from minisweagent.models.test_models import DeterministicModel, make_output
8
from minisweagent.run.benchmarks.swebench_single import main
9
 
10
 
11
def _make_model_from_fixture(text_outputs: list[str], cost_per_call: float = 1.0, **kwargs) -> DeterministicModel:
    """Build a DeterministicModel that replays raw text outputs from a trajectory fixture.

    Each text is wrapped with ``make_output``. If the text contains a fenced
    ``mswea_bash_command`` block, the content of the first such block becomes
    the single parsed command; otherwise the parsed command list is empty.
    Extra keyword arguments are forwarded to ``DeterministicModel``.
    """
    fence_pattern = re.compile(r"```mswea_bash_command\s*\n(.*?)\n```", re.DOTALL)

    def extract_commands(raw: str) -> list[dict]:
        found = fence_pattern.search(raw)
        if found is None:
            return []
        return [{"command": found.group(1)}]

    replayed_outputs = []
    for raw in text_outputs:
        replayed_outputs.append(make_output(raw, extract_commands(raw), cost=cost_per_call))

    return DeterministicModel(outputs=replayed_outputs, cost_per_call=cost_per_call, **kwargs)
23
 
24
 
25
@pytest.mark.slow
def test_swebench_single_end_to_end(github_test_data, tmp_path):
    """Run the swebench_single ``main`` on the ``_test`` subset with a deterministic model.

    This is mostly a smoke test: it checks that no exception is raised,
    that the patched ``get_model`` is called exactly once, and that the
    output file exists afterwards.
    """

    def _empty_reply(*args, **kwargs):
        # Stand-in for the interactive prompts: always answer with an empty string.
        return ""

    recorded_texts = github_test_data["model_responses"]
    swebench_config = str(package_dir / "config" / "benchmarks" / "swebench.yaml")
    output_path = tmp_path / "test_output.json"

    with (
        patch("minisweagent.run.benchmarks.swebench_single.get_model") as mock_get_model,
        patch("minisweagent.agents.interactive._prompt_session.prompt", side_effect=_empty_reply),
        patch("minisweagent.agents.interactive._multiline_prompt_session.prompt", side_effect=_empty_reply),
        patch("builtins.input", return_value=""),  # For LimitsExceeded handling
    ):
        mock_get_model.return_value = _make_model_from_fixture(recorded_texts, cost_per_call=0.1)

        # Run against an explicitly named instance.
        main(
            subset="_test",
            split="test",
            instance_spec="swe-agent__test-repo-1",
            model_name="deterministic",
            config_spec=[swebench_config],
            environment_class="docker",
            exit_immediately=False,
            output=output_path,
        )

        # The model factory must have been requested exactly once, and the run must have written its output.
        mock_get_model.assert_called_once()
        assert output_path.exists()
57
 
58
 
59
@pytest.mark.slow
def test_swebench_single_end_to_end_exit_immediately(github_test_data, tmp_path):
    """Run the swebench_single ``main`` on the ``_test`` subset with ``exit_immediately=True``.

    This is mostly a smoke test that no exception is raised. It exercises the
    --exit-immediately flag, which exits as soon as the agent wants to finish
    instead of prompting. It also checks that the patched ``get_model`` is
    called exactly once and that the output file exists afterwards.
    """

    def _empty_reply(*args, **kwargs):
        # Stand-in for the interactive prompts: always answer with an empty string.
        return ""

    recorded_texts = github_test_data["model_responses"]
    swebench_config = str(package_dir / "config" / "benchmarks" / "swebench.yaml")
    output_path = tmp_path / "test_output.json"

    with (
        patch("minisweagent.run.benchmarks.swebench_single.get_model") as mock_get_model,
        patch("minisweagent.agents.interactive._prompt_session.prompt", side_effect=_empty_reply),
        patch("minisweagent.agents.interactive._multiline_prompt_session.prompt", side_effect=_empty_reply),
        patch("builtins.input", return_value=""),  # For LimitsExceeded handling
    ):
        mock_get_model.return_value = _make_model_from_fixture(recorded_texts, cost_per_call=0.1)

        # Run against an explicitly named instance.
        main(
            subset="_test",
            split="test",
            instance_spec="swe-agent__test-repo-1",
            model_name="deterministic",
            config_spec=[swebench_config],
            environment_class="docker",
            exit_immediately=True,
            output=output_path,
        )

        # The model factory must have been requested exactly once, and the run must have written its output.
        mock_get_model.assert_called_once()
        assert output_path.exists()
92
 
92 lines