Coverage for src/local_deep_research/benchmarks/metrics/statistics.py: 100%

1"""

2Statistical functions for benchmark evaluation.

4Provides confidence intervals, sample size calculations, and

5proportion statistics for interpreting benchmark results.

6All implementations are pure Python.

7"""

9import math

12def normal_quantile(p: float) -> float:

13 """

14 Inverse normal CDF (percent-point function).

16 Uses the Beasley-Springer-Moro rational approximation,

17 accurate to approximately 1e-8.

19 Args:

20 p: Probability in (0, 1).

22 Returns:

23 z such that P(Z <= z) = p for standard normal Z.

25 Raises:

26 ValueError: If p is not in (0, 1).

27 """

28 if p <= 0 or p >= 1:

29 raise ValueError(f"p must be in (0, 1), got {p}")

31 # Rational approximation coefficients (Beasley-Springer-Moro)

32 a = [

33 -3.969683028665376e01,

34 2.209460984245205e02,

35 -2.759285104469687e02,

36 1.383577518672690e02,

37 -3.066479806614716e01,

38 2.506628277459239e00,

39 ]

40 b = [

41 -5.447609879822406e01,

42 1.615858368580409e02,

43 -1.556989798598866e02,

44 6.680131188771972e01,

45 -1.328068155288572e01,

46 ]

47 c = [

48 -7.784894002430293e-03,

49 -3.223964580411365e-01,

50 -2.400758277161838e00,

51 -2.549732539343734e00,

52 4.374664141464968e00,

53 2.938163982698783e00,

54 ]

55 d = [

56 7.784695709041462e-03,

57 3.224671290700398e-01,

58 2.445134137142996e00,

59 3.754408661907416e00,

60 ]

62 p_low = 0.02425

63 p_high = 1 - p_low

65 if p < p_low:

66 # Lower tail

67 q = math.sqrt(-2 * math.log(p))

68 return (

69 ((((c[0] * q + c[1]) * q + c[2]) * q + c[3]) * q + c[4]) * q + c[5]

70 ) / ((((d[0] * q + d[1]) * q + d[2]) * q + d[3]) * q + 1)

71 if p <= p_high:

72 # Central region

73 q = p - 0.5

74 r = q * q

75 return (

76 (

77 ((((a[0] * r + a[1]) * r + a[2]) * r + a[3]) * r + a[4]) * r

78 + a[5]

79 )

80 * q

81 ) / (((((b[0] * r + b[1]) * r + b[2]) * r + b[3]) * r + b[4]) * r + 1)

82 # Upper tail

83 q = math.sqrt(-2 * math.log(1 - p))

84 return -(

85 ((((c[0] * q + c[1]) * q + c[2]) * q + c[3]) * q + c[4]) * q + c[5]

86 ) / ((((d[0] * q + d[1]) * q + d[2]) * q + d[3]) * q + 1)

89def wilson_score_interval(

90 successes: int,

91 total: int,

92 confidence: float = 0.95,

93) -> dict:

94 """

95 Wilson score confidence interval for a binomial proportion.

97 Unlike the normal (Wald) approximation, the Wilson interval:

98 - Never produces bounds outside [0, 1]

99 - Behaves correctly at 0% and 100% accuracy

100 - Has better coverage at small sample sizes

101

102 Args:

103 successes: Number of successes (correct answers).

104 total: Total number of trials (graded examples).

105 confidence: Confidence level (default 0.95 for 95% CI).

106

107 Returns:

108 Dictionary with keys:

109 - lower: Lower bound of the interval

110 - upper: Upper bound of the interval

111 - center: Center of the Wilson interval (not the raw proportion)

112 - margin_of_error: Half-width of the interval

113 - sample_size: The total n used

114 """

115 if total <= 0:

116 return {

117 "lower": 0.0,

118 "upper": 0.0,

119 "center": 0.0,

120 "margin_of_error": 0.0,

121 "sample_size": 0,

122 }

123

124 if successes < 0 or successes > total:

125 raise ValueError(

126 f"successes must be in [0, total], got successes={successes}, total={total}"

127 )

128

129 p_hat = successes / total

130 z = normal_quantile(1 - (1 - confidence) / 2)

131 z2 = z * z

132

133 denominator = 1 + z2 / total

134 center = (p_hat + z2 / (2 * total)) / denominator

135 half_width = (

136 z * math.sqrt(p_hat * (1 - p_hat) / total + z2 / (4 * total * total))

137 ) / denominator

138

139 lower = max(0.0, center - half_width)

140 upper = min(1.0, center + half_width)

141

142 return {

143 "lower": lower,

144 "upper": upper,

145 "center": center,

146 "margin_of_error": half_width,

147 "sample_size": total,

148 }

149

150

151def proportion_std_error(p: float, n: int) -> float:

152 """

153 Standard error of a sample proportion.

154

155 Args:

156 p: Observed proportion (0 to 1).

157 n: Sample size.

158

159 Returns:

160 Standard error, or 0.0 if n <= 0.

161 """

162 if n <= 0:

163 return 0.0

164 if p < 0 or p > 1:

165 raise ValueError(f"p must be in [0, 1], got {p}")

166 return math.sqrt(p * (1 - p) / n)

167

168

169def sample_size_for_difference(

170 p1: float,

171 p2: float,

172 power: float = 0.8,

173 alpha: float = 0.05,

174) -> int:

175 """

176 Required sample size per group to detect a difference between

177 two independent proportions.

178

179 Uses the formula for a two-sided two-proportion z-test:

180 n = (z_alpha/2 + z_beta)^2 * (p1(1-p1) + p2(1-p2)) / (p1 - p2)^2

181

182 Args:

183 p1: Expected proportion for group 1.

184 p2: Expected proportion for group 2.

185 power: Statistical power (default 0.8 = 80%).

186 alpha: Significance level (default 0.05 for two-sided test).

187

188 Returns:

189 Required sample size per group (rounded up).

190

191 Raises:

192 ValueError: If p1 == p2 (no difference to detect).

193 """

194 if p1 == p2:

195 raise ValueError(

196 "p1 and p2 must differ to compute required sample size"

197 )

198 for name, val in [("p1", p1), ("p2", p2)]:

199 if val < 0 or val > 1:

200 raise ValueError(f"{name} must be in [0, 1], got {val}")

201 for name, val in [("power", power), ("alpha", alpha)]:

202 if val <= 0 or val >= 1:

203 raise ValueError(f"{name} must be in (0, 1), got {val}")

204

205 z_alpha = normal_quantile(1 - alpha / 2)

206 z_beta = normal_quantile(power)

207

208 numerator = (z_alpha + z_beta) ** 2 * (p1 * (1 - p1) + p2 * (1 - p2))

209 denominator = (p1 - p2) ** 2

210

211 return math.ceil(numerator / denominator)