Spaces:
Running
Running
Final strict spec-compliance polish: score precision, empty rewards, updated test assertions
6284048 | # tests/test_grader_variance.py | |
| # Phase 2 of judging runs a variance check. If all graders return the same score | |
| # for different quality answers, the submission is DISQUALIFIED. | |
| # Run: python -m pytest tests/test_grader_variance.py -v | |
| import sys | |
| sys.path.insert(0, '.') | |
| from server.graders.base_grader import safe_score | |
| from server.graders.security_grader import compute_correctness as sec_cc | |
| from server.graders.dependency_grader import compute_correctness as dep_cc | |
| from server.graders.clinical_grader import compute_correctness as cli_cc | |
# -- Security case for testing --
# Shared SQL-injection fixture, passed as the second argument to the security
# grader's compute_correctness by test_sec_identify_variance.
SEC_CASE = {
    'expected_vuln_type': 'sql_injection',
    # NOTE(review): presumably the [low, high] band of acceptable CVSS scores;
    # confirm against server/graders/security_grader.py.
    'cvss_range': [7.5, 9.8],
    'expected_severity': 'critical',
    'required_fix_tokens': ['?', 'parameterized'],
    'current_feedback_keywords': ['sql', 'injection'],
    'original_vuln_pattern': 'query+',
}
def test_sec_identify_variance():
    """Check that the security grader separates perfect, partial and wrong answers.

    Three ``identify_vulnerability`` actions are scored against ``SEC_CASE``
    and passed through ``safe_score``. The test requires that:

    * the three scores are pairwise distinct after rounding to two decimals;
    * they are strictly ordered ``perfect > partial > wrong``.
    """

    def make_answer(vuln_type, cvss_score, severity):
        # Every candidate shares the action type and affected line; only the
        # three graded fields vary between candidates.
        return {
            'action_type': 'identify_vulnerability',
            'vuln_type': vuln_type,
            'cvss_score': cvss_score,
            'severity': severity,
            'affected_line': 1,
        }

    perfect = make_answer('sql_injection', 8.5, 'critical')
    # Wrong vuln_type, but CVSS and severity still match the case.
    partial = make_answer('xss', 8.5, 'critical')
    # Wrong on every graded field.
    wrong = make_answer('xss', 2.0, 'low')

    s1, s2, s3 = (
        safe_score(sec_cc(answer, SEC_CASE))
        for answer in (perfect, partial, wrong)
    )

    distinct_scores = {round(s, 2) for s in [s1, s2, s3]}
    assert len(distinct_scores) >= 3, f'No variance: {s1},{s2},{s3}'
    assert s1 > s2 > s3, f'Wrong ordering: {s1},{s2},{s3}'
    print(f' Security identify variance: {s1:.4f} > {s2:.4f} > {s3:.4f} PASS')
def test_dep_resolve_variance():
    """Check that the dependency grader ranks a full resolution above weaker ones.

    Three ``resolve_conflict`` actions are scored against a torch/numpy case:

    * ``full``  - proposes torch 2.1.0 together with numpy 1.24.0;
    * ``part``  - proposes torch 2.1.0 but leaves numpy at 1.16.0;
    * ``empty`` - proposes no packages at all.

    The assertion is ``full > part >= empty``: the partial answer is allowed
    to tie with the empty one, but the full answer must beat both.
    """
    case = {
        'conflict_packages': ['torch', 'numpy'],
        'compatibility_matrix': {
            'torch': {'2.1.0': {'numpy': '>=1.24'}, '1.9.0': {}},
            'numpy': {'1.24.0': {}, '1.16.0': {}},
        },
        'requirements': {'torch': '1.9.0', 'numpy': '1.16.0'},
    }

    def resolve(packages):
        # All candidates differ only in the package pins they propose.
        return {'action_type': 'resolve_conflict', 'packages': packages, 'reasoning': 'ok'}

    full = resolve({'torch': '2.1.0', 'numpy': '1.24.0'})
    part = resolve({'torch': '2.1.0', 'numpy': '1.16.0'})
    empty = resolve({})

    s1 = safe_score(dep_cc(full, case))
    s2 = safe_score(dep_cc(part, case))
    s3 = safe_score(dep_cc(empty, case))

    assert s1 > s2 >= s3, f'No variance: {s1},{s2},{s3}'
    print(f' Dependency resolve variance: {s1:.4f} > {s2:.4f} >= {s3:.4f} PASS')
def test_cli_order_variance():
    """Check that the clinical grader rewards a complete, dependency-respecting order.

    Three ``order_steps`` actions are scored against a three-step case:

    * ``correct``  - all required steps, each listed after its prerequisites;
    * ``violated`` - all required steps in reverse order;
    * ``partial``  - the first two steps only (``schedule_surgery`` omitted).

    The correct answer must score strictly higher than each of the other two.
    No ordering between ``violated`` and ``partial`` is asserted.
    """
    case = {
        # Maps each step to the steps listed as its prerequisites.
        'dependency_graph': {
            'schedule_surgery': ['resolve_insurance', 'complete_pre_op'],
            'complete_pre_op': ['resolve_insurance'],
            'resolve_insurance': [],
        },
        'required_steps': ['resolve_insurance', 'complete_pre_op', 'schedule_surgery'],
    }

    def order(steps):
        # All candidates differ only in the step sequence they submit.
        return {'action_type': 'order_steps', 'recovery_steps': steps}

    correct = order(['resolve_insurance', 'complete_pre_op', 'schedule_surgery'])
    violated = order(['schedule_surgery', 'complete_pre_op', 'resolve_insurance'])
    partial = order(['resolve_insurance', 'complete_pre_op'])

    s1 = safe_score(cli_cc(correct, case))
    s2 = safe_score(cli_cc(violated, case))
    s3 = safe_score(cli_cc(partial, case))

    assert s1 > s2, f'Violation not penalised: correct={s1}, violated={s2}'
    assert s1 > s3, f'Completeness not rewarded: correct={s1}, partial={s3}'
    print(f' Clinical order variance: {s1:.4f} > violated:{s2:.4f}, partial:{s3:.4f} PASS')
def test_safe_score_clamp():
    """Check that safe_score clamps every input into [0.01, 0.99].

    Rationale recorded by the original author:

    * Floor of 0.01 rather than 0.0: the official spec is said to require
      scores strictly greater than 0, and a 0.0 from a crashed run would be
      indistinguishable from a broken environment. 0.01 signals
      "ran but failed".
    * Ceiling of 0.99 rather than 1.0: a score of exactly 1.0 would suggest
      the grader is trivially solved or broken. 0.99 signals
      "excellent but not perfect".
    """
    # Each entry: (input, expected result, suffix for the failure message).
    cases = [
        # Floor: None, negatives and non-numeric inputs map to 0.01.
        (None, 0.01, ''),
        (-0.5, 0.01, ''),
        (-999, 0.01, ''),
        ('bad', 0.01, ''),
        ([], 0.01, ''),
        # Ceiling: values above 1 map to 0.99.
        (1.5, 0.99, ''),
        (2.0, 0.99, ''),
        (100, 0.99, ''),
        # Exact boundary values are returned as-is.
        (0.01, 0.01, ''),
        (0.99, 0.99, ''),
        # Pass-through: values inside the range stay unchanged.
        (0.5, 0.5, ''),
        (0.85, 0.85, ''),
        # Values inside (0, 1) but outside [0.01, 0.99] are still clamped.
        (0.0001, 0.01, ' (below floor)'),
        (0.9999, 0.99, ' (above ceiling)'),
    ]
    for raw, expected, note in cases:
        got = safe_score(raw)
        assert got == expected, f"Expected {expected}{note}, got {got}"
    print(' safe_score clamp [0.01, 0.99]: PASS')
def test_clinical_valid_actions():
    """Regression test for "Bug 2": propose_recovery must not be a clinical action.

    Also pins the clinical grader's VALID_ACTIONS to exactly
    ``detect_gap``, ``rank_issues`` and ``order_steps``.
    """
    # Imported inside the test, as in the original, so an import problem with
    # VALID_ACTIONS surfaces only when this test runs.
    from server.graders.clinical_grader import VALID_ACTIONS

    assert 'propose_recovery' not in VALID_ACTIONS, 'Bug 2 still present!'

    expected = {'detect_gap', 'rank_issues', 'order_steps'}
    actual = set(VALID_ACTIONS)
    # The message matters when this file is run as a script, where pytest's
    # assertion introspection is not available.
    assert actual == expected, f'Unexpected clinical VALID_ACTIONS: {actual!r}'
    print(' Clinical VALID_ACTIONS (Bug 2): PASS')
if __name__ == '__main__':
    # Script entry point: run every check in sequence without pytest. The
    # first failing assertion aborts the run with an AssertionError.
    test_safe_score_clamp()
    test_clinical_valid_actions()
    test_sec_identify_variance()
    test_dep_resolve_variance()
    test_cli_order_variance()
    # NOTE(review): the trailing glyph looks mis-encoded (possibly a check-mark
    # emoji in the original file) - confirm before changing this string.
    print('\nALL VARIANCE TESTS PASSED β ')