| 1 | import re
|
| 2 | from unittest.mock import patch
|
| 3 |
|
| 4 | import pytest
|
| 5 |
|
| 6 | from minisweagent import package_dir
|
| 7 | from minisweagent.models.test_models import DeterministicModel, make_output
|
| 8 | from minisweagent.run.benchmarks.swebench_single import main
|
| 9 |
|
| 10 |
|
def _make_model_from_fixture(text_outputs: list[str], cost_per_call: float = 1.0, **kwargs) -> DeterministicModel:
    """Build a DeterministicModel whose scripted outputs come from raw fixture texts.

    Each entry of ``text_outputs`` is searched for the first fenced
    ``mswea_bash_command`` block. When one is found, its content becomes a
    single ``{"command": ...}`` action; otherwise the action list is empty.
    The text, its actions and ``cost_per_call`` are handed to ``make_output``.
    Any extra keyword arguments are forwarded unchanged to ``DeterministicModel``.
    """
    # DOTALL lets a command span several lines inside the fenced block.
    command_pattern = re.compile(r"```mswea_bash_command\s*\n(.*?)\n```", re.DOTALL)

    def extract_actions(raw: str) -> list[dict]:
        found = command_pattern.search(raw)
        if found is None:
            return []
        return [{"command": found.group(1)}]

    scripted_outputs = []
    for raw_text in text_outputs:
        actions = extract_actions(raw_text)
        scripted_outputs.append(make_output(raw_text, actions, cost=cost_per_call))

    return DeterministicModel(
        outputs=scripted_outputs,
        cost_per_call=cost_per_call,
        **kwargs,
    )
|
| 23 |
|
| 24 |
|
@pytest.mark.slow
def test_swebench_single_end_to_end(github_test_data, tmp_path):
    """Smoke-test ``swebench_single.main`` on the ``_test`` subset with a scripted model.

    The model replies come from the ``github_test_data`` fixture and the run
    uses ``exit_immediately=False``. Interactive prompts and ``input`` are
    patched to answer with an empty string. The main point is that the run
    finishes without raising.
    """

    def _empty_answer(*args, **kwargs):
        # Stand-in for every interactive prompt: always reply with nothing.
        return ""

    recorded_replies = github_test_data["model_responses"]

    with (
        patch("minisweagent.run.benchmarks.swebench_single.get_model") as get_model_mock,
        patch("minisweagent.agents.interactive._prompt_session.prompt", side_effect=_empty_answer),
        patch("minisweagent.agents.interactive._multiline_prompt_session.prompt", side_effect=_empty_answer),
        patch("builtins.input", return_value=""),  # For LimitsExceeded handling
    ):
        get_model_mock.return_value = _make_model_from_fixture(recorded_replies, cost_per_call=0.1)

        # Run a single, explicitly named instance and write the result into tmp_path.
        result_file = tmp_path / "test_output.json"
        swebench_config = str(package_dir / "config" / "benchmarks" / "swebench.yaml")
        main(
            subset="_test",
            split="test",
            instance_spec="swe-agent__test-repo-1",
            model_name="deterministic",
            config_spec=[swebench_config],
            environment_class="docker",
            exit_immediately=False,
            output=result_file,
        )

        # The model factory must have been used exactly once, and the output file must exist.
        get_model_mock.assert_called_once()
        assert result_file.exists()
|
| 57 |
|
| 58 |
|
@pytest.mark.slow
def test_swebench_single_end_to_end_exit_immediately(github_test_data, tmp_path):
    """Smoke-test ``swebench_single.main`` with ``exit_immediately=True``.

    Same setup as the plain end-to-end test: the ``_test`` subset, a scripted
    model fed from the ``github_test_data`` fixture, and prompts patched to
    answer with an empty string. Here the run is asked to exit immediately
    when the agent wants to finish instead of prompting. The main point is
    that the run finishes without raising.
    """

    def _empty_answer(*args, **kwargs):
        # Stand-in for every interactive prompt: always reply with nothing.
        return ""

    recorded_replies = github_test_data["model_responses"]

    with (
        patch("minisweagent.run.benchmarks.swebench_single.get_model") as get_model_mock,
        patch("minisweagent.agents.interactive._prompt_session.prompt", side_effect=_empty_answer),
        patch("minisweagent.agents.interactive._multiline_prompt_session.prompt", side_effect=_empty_answer),
        patch("builtins.input", return_value=""),  # For LimitsExceeded handling
    ):
        get_model_mock.return_value = _make_model_from_fixture(recorded_replies, cost_per_call=0.1)

        # Run a single, explicitly named instance and write the result into tmp_path.
        result_file = tmp_path / "test_output.json"
        run_options = {
            "subset": "_test",
            "split": "test",
            "instance_spec": "swe-agent__test-repo-1",
            "model_name": "deterministic",
            "config_spec": [str(package_dir / "config" / "benchmarks" / "swebench.yaml")],
            "environment_class": "docker",
            "exit_immediately": True,
            "output": result_file,
        }
        main(**run_options)

        # The model factory must have been used exactly once, and the output file must exist.
        get_model_mock.assert_called_once()
        assert result_file.exists()
|
| 92 |
|