EntropyEnv / tests /test_grader_variance.py
immortalindeed's picture
Final strict spec-compliance polish: score precision, empty rewards, updated test assertions
6284048
# tests/test_grader_variance.py
# Phase 2 of judging runs a variance check. If all graders return the same score
# for different quality answers, the submission is DISQUALIFIED.
# Run: python -m pytest tests/test_grader_variance.py -v
import sys
sys.path.insert(0, '.')
from server.graders.base_grader import safe_score
from server.graders.security_grader import compute_correctness as sec_cc
from server.graders.dependency_grader import compute_correctness as dep_cc
from server.graders.clinical_grader import compute_correctness as cli_cc
# ── Security Case for Testing ──
SEC_CASE = {
'expected_vuln_type': 'sql_injection',
'cvss_range': [7.5, 9.8],
'expected_severity': 'critical',
'required_fix_tokens': ['?', 'parameterized'],
'current_feedback_keywords': ['sql', 'injection'],
'original_vuln_pattern': 'query+',
}
def test_sec_identify_variance():
"""Security grader must return 3+ different scores for different quality answers."""
perfect = {
'action_type': 'identify_vulnerability',
'vuln_type': 'sql_injection',
'cvss_score': 8.5,
'severity': 'critical',
'affected_line': 1,
}
partial = {
'action_type': 'identify_vulnerability',
'vuln_type': 'xss', # wrong vuln_type
'cvss_score': 8.5, # but correct CVSS
'severity': 'critical', # and correct severity
'affected_line': 1,
}
wrong = {
'action_type': 'identify_vulnerability',
'vuln_type': 'xss', # wrong everything
'cvss_score': 2.0,
'severity': 'low',
'affected_line': 1,
}
s1 = safe_score(sec_cc(perfect, SEC_CASE))
s2 = safe_score(sec_cc(partial, SEC_CASE))
s3 = safe_score(sec_cc(wrong, SEC_CASE))
assert len({round(s, 2) for s in [s1, s2, s3]}) >= 3, f'No variance: {s1},{s2},{s3}'
assert s1 > s2 > s3, f'Wrong ordering: {s1},{s2},{s3}'
print(f' Security identify variance: {s1:.4f} > {s2:.4f} > {s3:.4f} PASS')
def test_dep_resolve_variance():
"""Dependency grader must return different scores for different quality answers."""
case = {
'conflict_packages': ['torch', 'numpy'],
'compatibility_matrix': {
'torch': {'2.1.0': {'numpy': '>=1.24'}, '1.9.0': {}},
'numpy': {'1.24.0': {}, '1.16.0': {}},
},
'requirements': {'torch': '1.9.0', 'numpy': '1.16.0'},
}
full = {'action_type': 'resolve_conflict', 'packages': {'torch': '2.1.0', 'numpy': '1.24.0'}, 'reasoning': 'ok'}
part = {'action_type': 'resolve_conflict', 'packages': {'torch': '2.1.0', 'numpy': '1.16.0'}, 'reasoning': 'ok'}
empty = {'action_type': 'resolve_conflict', 'packages': {}, 'reasoning': 'ok'}
s1 = safe_score(dep_cc(full, case))
s2 = safe_score(dep_cc(part, case))
s3 = safe_score(dep_cc(empty, case))
assert s1 > s2 >= s3, f'No variance: {s1},{s2},{s3}'
print(f' Dependency resolve variance: {s1:.4f} > {s2:.4f} >= {s3:.4f} PASS')
def test_cli_order_variance():
"""Clinical grader must return different scores for correct vs violated dependency order."""
case = {
'dependency_graph': {
'schedule_surgery': ['resolve_insurance', 'complete_pre_op'],
'complete_pre_op': ['resolve_insurance'],
'resolve_insurance': [],
},
'required_steps': ['resolve_insurance', 'complete_pre_op', 'schedule_surgery'],
}
correct = {
'action_type': 'order_steps',
'recovery_steps': ['resolve_insurance', 'complete_pre_op', 'schedule_surgery'],
}
violated = {
'action_type': 'order_steps',
'recovery_steps': ['schedule_surgery', 'complete_pre_op', 'resolve_insurance'],
}
partial = {
'action_type': 'order_steps',
'recovery_steps': ['resolve_insurance', 'complete_pre_op'],
}
s1 = safe_score(cli_cc(correct, case))
s2 = safe_score(cli_cc(violated, case))
s3 = safe_score(cli_cc(partial, case))
assert s1 > s2, f'Violation not penalised: correct={s1}, violated={s2}'
assert s1 > s3, f'Completeness not rewarded: correct={s1}, partial={s3}'
print(f' Clinical order variance: {s1:.4f} > violated:{s2:.4f}, partial:{s3:.4f} PASS')
def test_safe_score_clamp():
"""
safe_score clamps to [0.01, 0.99] β€” strictly between 0 and 1.
WHY 0.01 not 0.0: The official spec says scores must be strictly > 0.
A score of 0.0 from a crashed run looks indistinguishable
from a broken environment. 0.01 signals "ran but failed".
WHY 0.99 not 1.0: A score of exactly 1.0 means the grader is trivially solved
or broken. 0.99 signals "excellent but not perfect".
"""
# Floor: None, negative, bad types β†’ 0.01
assert safe_score(None) == 0.01, f"Expected 0.01, got {safe_score(None)}"
assert safe_score(-0.5) == 0.01, f"Expected 0.01, got {safe_score(-0.5)}"
assert safe_score(-999) == 0.01, f"Expected 0.01, got {safe_score(-999)}"
assert safe_score('bad') == 0.01, f"Expected 0.01, got {safe_score('bad')}"
assert safe_score([]) == 0.01, f"Expected 0.01, got {safe_score([])}"
# Ceiling: values > 1 β†’ 0.99
assert safe_score(1.5) == 0.99, f"Expected 0.99, got {safe_score(1.5)}"
assert safe_score(2.0) == 0.99, f"Expected 0.99, got {safe_score(2.0)}"
assert safe_score(100) == 0.99, f"Expected 0.99, got {safe_score(100)}"
# Exact boundary values
assert safe_score(0.01) == 0.01, f"Expected 0.01, got {safe_score(0.01)}"
assert safe_score(0.99) == 0.99, f"Expected 0.99, got {safe_score(0.99)}"
# Pass-through: normal values in range stay unchanged
assert safe_score(0.5) == 0.5, f"Expected 0.5, got {safe_score(0.5)}"
assert safe_score(0.85) == 0.85, f"Expected 0.85, got {safe_score(0.85)}"
assert safe_score(0.0001) == 0.01, f"Expected 0.01 (below floor), got {safe_score(0.0001)}"
assert safe_score(0.9999) == 0.99, f"Expected 0.99 (above ceiling), got {safe_score(0.9999)}"
print(' safe_score clamp [0.01, 0.99]: PASS')
def test_clinical_valid_actions():
"""Bug 2 fix: propose_recovery must NOT be in clinical VALID_ACTIONS."""
from server.graders.clinical_grader import VALID_ACTIONS
assert 'propose_recovery' not in VALID_ACTIONS, 'Bug 2 still present!'
assert set(VALID_ACTIONS) == {'detect_gap', 'rank_issues', 'order_steps'}
print(' Clinical VALID_ACTIONS (Bug 2): PASS')
if __name__ == '__main__':
test_safe_score_clamp()
test_clinical_valid_actions()
test_sec_identify_variance()
test_dep_resolve_variance()
test_cli_order_variance()
print('\nALL VARIANCE TESTS PASSED βœ…')