Coverage for src/local_deep_research/benchmarks/metrics/statistics.py: 100%
53 statements
« prev ^ index » next coverage.py v7.14.1, created at 2026-06-03 23:15 +0000
« prev ^ index » next coverage.py v7.14.1, created at 2026-06-03 23:15 +0000
1"""
2Statistical functions for benchmark evaluation.
4Provides confidence intervals, sample size calculations, and
5proportion statistics for interpreting benchmark results.
6All implementations are pure Python.
7"""
9import math
12def normal_quantile(p: float) -> float:
13 """
14 Inverse normal CDF (percent-point function).
16 Uses the Beasley-Springer-Moro rational approximation,
17 accurate to approximately 1e-8.
19 Args:
20 p: Probability in (0, 1).
22 Returns:
23 z such that P(Z <= z) = p for standard normal Z.
25 Raises:
26 ValueError: If p is not in (0, 1).
27 """
28 if p <= 0 or p >= 1:
29 raise ValueError(f"p must be in (0, 1), got {p}")
31 # Rational approximation coefficients (Beasley-Springer-Moro)
32 a = [
33 -3.969683028665376e01,
34 2.209460984245205e02,
35 -2.759285104469687e02,
36 1.383577518672690e02,
37 -3.066479806614716e01,
38 2.506628277459239e00,
39 ]
40 b = [
41 -5.447609879822406e01,
42 1.615858368580409e02,
43 -1.556989798598866e02,
44 6.680131188771972e01,
45 -1.328068155288572e01,
46 ]
47 c = [
48 -7.784894002430293e-03,
49 -3.223964580411365e-01,
50 -2.400758277161838e00,
51 -2.549732539343734e00,
52 4.374664141464968e00,
53 2.938163982698783e00,
54 ]
55 d = [
56 7.784695709041462e-03,
57 3.224671290700398e-01,
58 2.445134137142996e00,
59 3.754408661907416e00,
60 ]
62 p_low = 0.02425
63 p_high = 1 - p_low
65 if p < p_low:
66 # Lower tail
67 q = math.sqrt(-2 * math.log(p))
68 return (
69 ((((c[0] * q + c[1]) * q + c[2]) * q + c[3]) * q + c[4]) * q + c[5]
70 ) / ((((d[0] * q + d[1]) * q + d[2]) * q + d[3]) * q + 1)
71 if p <= p_high:
72 # Central region
73 q = p - 0.5
74 r = q * q
75 return (
76 (
77 ((((a[0] * r + a[1]) * r + a[2]) * r + a[3]) * r + a[4]) * r
78 + a[5]
79 )
80 * q
81 ) / (((((b[0] * r + b[1]) * r + b[2]) * r + b[3]) * r + b[4]) * r + 1)
82 # Upper tail
83 q = math.sqrt(-2 * math.log(1 - p))
84 return -(
85 ((((c[0] * q + c[1]) * q + c[2]) * q + c[3]) * q + c[4]) * q + c[5]
86 ) / ((((d[0] * q + d[1]) * q + d[2]) * q + d[3]) * q + 1)
89def wilson_score_interval(
90 successes: int,
91 total: int,
92 confidence: float = 0.95,
93) -> dict:
94 """
95 Wilson score confidence interval for a binomial proportion.
97 Unlike the normal (Wald) approximation, the Wilson interval:
98 - Never produces bounds outside [0, 1]
99 - Behaves correctly at 0% and 100% accuracy
100 - Has better coverage at small sample sizes
102 Args:
103 successes: Number of successes (correct answers).
104 total: Total number of trials (graded examples).
105 confidence: Confidence level (default 0.95 for 95% CI).
107 Returns:
108 Dictionary with keys:
109 - lower: Lower bound of the interval
110 - upper: Upper bound of the interval
111 - center: Center of the Wilson interval (not the raw proportion)
112 - margin_of_error: Half-width of the interval
113 - sample_size: The total n used
114 """
115 if total <= 0:
116 return {
117 "lower": 0.0,
118 "upper": 0.0,
119 "center": 0.0,
120 "margin_of_error": 0.0,
121 "sample_size": 0,
122 }
124 if successes < 0 or successes > total:
125 raise ValueError(
126 f"successes must be in [0, total], got successes={successes}, total={total}"
127 )
129 p_hat = successes / total
130 z = normal_quantile(1 - (1 - confidence) / 2)
131 z2 = z * z
133 denominator = 1 + z2 / total
134 center = (p_hat + z2 / (2 * total)) / denominator
135 half_width = (
136 z * math.sqrt(p_hat * (1 - p_hat) / total + z2 / (4 * total * total))
137 ) / denominator
139 lower = max(0.0, center - half_width)
140 upper = min(1.0, center + half_width)
142 return {
143 "lower": lower,
144 "upper": upper,
145 "center": center,
146 "margin_of_error": half_width,
147 "sample_size": total,
148 }
151def proportion_std_error(p: float, n: int) -> float:
152 """
153 Standard error of a sample proportion.
155 Args:
156 p: Observed proportion (0 to 1).
157 n: Sample size.
159 Returns:
160 Standard error, or 0.0 if n <= 0.
161 """
162 if n <= 0:
163 return 0.0
164 if p < 0 or p > 1:
165 raise ValueError(f"p must be in [0, 1], got {p}")
166 return math.sqrt(p * (1 - p) / n)
169def sample_size_for_difference(
170 p1: float,
171 p2: float,
172 power: float = 0.8,
173 alpha: float = 0.05,
174) -> int:
175 """
176 Required sample size per group to detect a difference between
177 two independent proportions.
179 Uses the formula for a two-sided two-proportion z-test:
180 n = (z_alpha/2 + z_beta)^2 * (p1(1-p1) + p2(1-p2)) / (p1 - p2)^2
182 Args:
183 p1: Expected proportion for group 1.
184 p2: Expected proportion for group 2.
185 power: Statistical power (default 0.8 = 80%).
186 alpha: Significance level (default 0.05 for two-sided test).
188 Returns:
189 Required sample size per group (rounded up).
191 Raises:
192 ValueError: If p1 == p2 (no difference to detect).
193 """
194 if p1 == p2:
195 raise ValueError(
196 "p1 and p2 must differ to compute required sample size"
197 )
198 for name, val in [("p1", p1), ("p2", p2)]:
199 if val < 0 or val > 1:
200 raise ValueError(f"{name} must be in [0, 1], got {val}")
201 for name, val in [("power", power), ("alpha", alpha)]:
202 if val <= 0 or val >= 1:
203 raise ValueError(f"{name} must be in (0, 1), got {val}")
205 z_alpha = normal_quantile(1 - alpha / 2)
206 z_beta = normal_quantile(power)
208 numerator = (z_alpha + z_beta) ** 2 * (p1 * (1 - p1) + p2 * (1 - p2))
209 denominator = (p1 - p2) ** 2
211 return math.ceil(numerator / denominator)