Coverage for src/local_deep_research/benchmarks/metrics/statistics.py: 100%

53 statements  

« prev     ^ index     » next       coverage.py v7.14.1, created at 2026-06-03 23:15 +0000

1""" 

2Statistical functions for benchmark evaluation. 

3 

4Provides confidence intervals, sample size calculations, and 

5proportion statistics for interpreting benchmark results. 

6All implementations are pure Python. 

7""" 

8 

9import math 

10 

11 

12def normal_quantile(p: float) -> float: 

13 """ 

14 Inverse normal CDF (percent-point function). 

15 

16 Uses the Beasley-Springer-Moro rational approximation, 

17 accurate to approximately 1e-8. 

18 

19 Args: 

20 p: Probability in (0, 1). 

21 

22 Returns: 

23 z such that P(Z <= z) = p for standard normal Z. 

24 

25 Raises: 

26 ValueError: If p is not in (0, 1). 

27 """ 

28 if p <= 0 or p >= 1: 

29 raise ValueError(f"p must be in (0, 1), got {p}") 

30 

31 # Rational approximation coefficients (Beasley-Springer-Moro) 

32 a = [ 

33 -3.969683028665376e01, 

34 2.209460984245205e02, 

35 -2.759285104469687e02, 

36 1.383577518672690e02, 

37 -3.066479806614716e01, 

38 2.506628277459239e00, 

39 ] 

40 b = [ 

41 -5.447609879822406e01, 

42 1.615858368580409e02, 

43 -1.556989798598866e02, 

44 6.680131188771972e01, 

45 -1.328068155288572e01, 

46 ] 

47 c = [ 

48 -7.784894002430293e-03, 

49 -3.223964580411365e-01, 

50 -2.400758277161838e00, 

51 -2.549732539343734e00, 

52 4.374664141464968e00, 

53 2.938163982698783e00, 

54 ] 

55 d = [ 

56 7.784695709041462e-03, 

57 3.224671290700398e-01, 

58 2.445134137142996e00, 

59 3.754408661907416e00, 

60 ] 

61 

62 p_low = 0.02425 

63 p_high = 1 - p_low 

64 

65 if p < p_low: 

66 # Lower tail 

67 q = math.sqrt(-2 * math.log(p)) 

68 return ( 

69 ((((c[0] * q + c[1]) * q + c[2]) * q + c[3]) * q + c[4]) * q + c[5] 

70 ) / ((((d[0] * q + d[1]) * q + d[2]) * q + d[3]) * q + 1) 

71 if p <= p_high: 

72 # Central region 

73 q = p - 0.5 

74 r = q * q 

75 return ( 

76 ( 

77 ((((a[0] * r + a[1]) * r + a[2]) * r + a[3]) * r + a[4]) * r 

78 + a[5] 

79 ) 

80 * q 

81 ) / (((((b[0] * r + b[1]) * r + b[2]) * r + b[3]) * r + b[4]) * r + 1) 

82 # Upper tail 

83 q = math.sqrt(-2 * math.log(1 - p)) 

84 return -( 

85 ((((c[0] * q + c[1]) * q + c[2]) * q + c[3]) * q + c[4]) * q + c[5] 

86 ) / ((((d[0] * q + d[1]) * q + d[2]) * q + d[3]) * q + 1) 

87 

88 

89def wilson_score_interval( 

90 successes: int, 

91 total: int, 

92 confidence: float = 0.95, 

93) -> dict: 

94 """ 

95 Wilson score confidence interval for a binomial proportion. 

96 

97 Unlike the normal (Wald) approximation, the Wilson interval: 

98 - Never produces bounds outside [0, 1] 

99 - Behaves correctly at 0% and 100% accuracy 

100 - Has better coverage at small sample sizes 

101 

102 Args: 

103 successes: Number of successes (correct answers). 

104 total: Total number of trials (graded examples). 

105 confidence: Confidence level (default 0.95 for 95% CI). 

106 

107 Returns: 

108 Dictionary with keys: 

109 - lower: Lower bound of the interval 

110 - upper: Upper bound of the interval 

111 - center: Center of the Wilson interval (not the raw proportion) 

112 - margin_of_error: Half-width of the interval 

113 - sample_size: The total n used 

114 """ 

115 if total <= 0: 

116 return { 

117 "lower": 0.0, 

118 "upper": 0.0, 

119 "center": 0.0, 

120 "margin_of_error": 0.0, 

121 "sample_size": 0, 

122 } 

123 

124 if successes < 0 or successes > total: 

125 raise ValueError( 

126 f"successes must be in [0, total], got successes={successes}, total={total}" 

127 ) 

128 

129 p_hat = successes / total 

130 z = normal_quantile(1 - (1 - confidence) / 2) 

131 z2 = z * z 

132 

133 denominator = 1 + z2 / total 

134 center = (p_hat + z2 / (2 * total)) / denominator 

135 half_width = ( 

136 z * math.sqrt(p_hat * (1 - p_hat) / total + z2 / (4 * total * total)) 

137 ) / denominator 

138 

139 lower = max(0.0, center - half_width) 

140 upper = min(1.0, center + half_width) 

141 

142 return { 

143 "lower": lower, 

144 "upper": upper, 

145 "center": center, 

146 "margin_of_error": half_width, 

147 "sample_size": total, 

148 } 

149 

150 

151def proportion_std_error(p: float, n: int) -> float: 

152 """ 

153 Standard error of a sample proportion. 

154 

155 Args: 

156 p: Observed proportion (0 to 1). 

157 n: Sample size. 

158 

159 Returns: 

160 Standard error, or 0.0 if n <= 0. 

161 """ 

162 if n <= 0: 

163 return 0.0 

164 if p < 0 or p > 1: 

165 raise ValueError(f"p must be in [0, 1], got {p}") 

166 return math.sqrt(p * (1 - p) / n) 

167 

168 

169def sample_size_for_difference( 

170 p1: float, 

171 p2: float, 

172 power: float = 0.8, 

173 alpha: float = 0.05, 

174) -> int: 

175 """ 

176 Required sample size per group to detect a difference between 

177 two independent proportions. 

178 

179 Uses the formula for a two-sided two-proportion z-test: 

180 n = (z_alpha/2 + z_beta)^2 * (p1(1-p1) + p2(1-p2)) / (p1 - p2)^2 

181 

182 Args: 

183 p1: Expected proportion for group 1. 

184 p2: Expected proportion for group 2. 

185 power: Statistical power (default 0.8 = 80%). 

186 alpha: Significance level (default 0.05 for two-sided test). 

187 

188 Returns: 

189 Required sample size per group (rounded up). 

190 

191 Raises: 

192 ValueError: If p1 == p2 (no difference to detect). 

193 """ 

194 if p1 == p2: 

195 raise ValueError( 

196 "p1 and p2 must differ to compute required sample size" 

197 ) 

198 for name, val in [("p1", p1), ("p2", p2)]: 

199 if val < 0 or val > 1: 

200 raise ValueError(f"{name} must be in [0, 1], got {val}") 

201 for name, val in [("power", power), ("alpha", alpha)]: 

202 if val <= 0 or val >= 1: 

203 raise ValueError(f"{name} must be in (0, 1), got {val}") 

204 

205 z_alpha = normal_quantile(1 - alpha / 2) 

206 z_beta = normal_quantile(power) 

207 

208 numerator = (z_alpha + z_beta) ** 2 * (p1 * (1 - p1) + p2 * (1 - p2)) 

209 denominator = (p1 - p2) ** 2 

210 

211 return math.ceil(numerator / denominator)