Coverage for src/local_deep_research/benchmarks/metrics/reporting.py: 94%

60 statements  

« prev     ^ index     » next       coverage.py v7.14.1, created at 2026-06-03 23:15 +0000

1""" 

2Report generation for benchmark results. 

3 

4This module provides functions for generating detailed reports from benchmark results. 

5""" 

6 

7import json 

8 

9# import logging - replaced with loguru 

10from loguru import logger 

11from datetime import datetime, UTC 

12from typing import Any, Dict, Optional 

13 

14# logger = logging.getLogger(__name__) - using loguru logger directly 

15 

16 

# Maximum number of correct / incorrect examples shown in the report.
_MAX_EXAMPLES_PER_SECTION = 5


def _sample_examples(
    results_file: str, limit: int = _MAX_EXAMPLES_PER_SECTION
) -> tuple[list[Dict[str, Any]], list[Dict[str, Any]]]:
    """Stream a JSONL results file and pick the first graded examples.

    Only records that are JSON objects containing an ``is_correct`` key are
    considered. Reading stops as soon as both buckets are full, so large
    result files are never held in memory.

    Args:
        results_file: Path to the JSONL results file
        limit: Maximum number of examples to collect per bucket

    Returns:
        Tuple of (correct examples, incorrect examples). Both lists are empty
        if the file cannot be read at all.
    """
    correct: list[Dict[str, Any]] = []
    incorrect: list[Dict[str, Any]] = []
    try:
        with open(results_file, "r", encoding="utf-8") as f:
            for line_number, line in enumerate(f, start=1):
                if len(correct) >= limit and len(incorrect) >= limit:
                    break
                if not line.strip():
                    continue
                try:
                    record = json.loads(line)
                except json.JSONDecodeError:
                    # A single bad line (e.g. a truncated final record from an
                    # interrupted run) must not discard every other example.
                    logger.warning(
                        "Skipping malformed JSON on line {} of {}",
                        line_number,
                        results_file,
                    )
                    continue
                # Non-object lines (null, lists, strings) have no fields to
                # report; ungraded records belong to neither bucket.
                if not isinstance(record, dict) or "is_correct" not in record:
                    continue
                bucket = correct if record["is_correct"] else incorrect
                if len(bucket) < limit:
                    bucket.append(record)
    except Exception:
        # Best effort: the report is still useful without example sections.
        logger.exception("Error loading results for report")
        return [], []
    return correct, incorrect


def _format_examples(heading: str, examples: list[Dict[str, Any]]) -> list[str]:
    """Render one markdown example section; empty input yields no lines."""
    if not examples:
        return []

    lines = [heading, ""]
    for idx, example in enumerate(examples, start=1):
        lines.extend(
            [
                f"### Example {idx}",
                "",
                f"**Question**: {example.get('problem', '')}",
                "",
                f"**Correct Answer**: {example.get('correct_answer', '')}",
                "",
                f"**Model Answer**: {example.get('extracted_answer', '')}",
                "",
                f"**Reasoning**: {example.get('reasoning', '')}",
                "",
            ]
        )
    return lines


def generate_report(
    metrics: Dict[str, Any],
    results_file: str,
    output_file: str = "evaluation_report.md",
    dataset_name: str = "Unknown",
    config_info: Optional[Dict[str, Any]] = None,
) -> str:
    """
    Generate a detailed markdown report from evaluation results.

    The report contains a summary, optional per-category metrics, optional
    configuration info, up to five correct and five incorrect examples taken
    from the results file, and a metadata footer with a UTC timestamp.

    Args:
        metrics: Dictionary of evaluation metrics
        results_file: Path to the JSONL results file (one JSON object per
            line); unreadable files and malformed lines are logged and skipped
        output_file: Path to save report
        dataset_name: Name of dataset
        config_info: Optional configuration information

    Returns:
        Path to the generated report file
    """
    # Sample up to 5 correct and 5 incorrect examples
    correct_examples, incorrect_examples = _sample_examples(results_file)

    # Create report
    report = [
        f"# Evaluation Report: {dataset_name}",
        "",
        "## Summary",
        "",
        f"- **Total Examples**: {metrics.get('total_examples', 0)}",
        f"- **Graded Examples**: {metrics.get('graded_examples', 0)}",
        f"- **Correct Answers**: {metrics.get('correct', 0)}",
        f"- **Accuracy**: {metrics.get('accuracy', 0):.3f}",
    ]

    # Add confidence interval if available
    accuracy_ci = metrics.get("accuracy_ci")
    if accuracy_ci and accuracy_ci.get("sample_size", 0) > 0:
        report.append(
            f"- **95% CI**: [{accuracy_ci['lower']:.3f}, {accuracy_ci['upper']:.3f}] "
            f"(Wilson score, n={accuracy_ci['sample_size']})"
        )

    if "average_processing_time" in metrics:
        report.append(
            f"- **Average Processing Time**: {metrics['average_processing_time']:.2f} seconds"
        )

    if "average_confidence" in metrics:
        report.append(
            f"- **Average Confidence**: {metrics['average_confidence']:.2f}%"
        )

    if (metrics.get("error_count") or 0) > 0:
        report.append(f"- **Error Count**: {metrics['error_count']}")
        # The rate is optional: omit the line instead of raising KeyError
        # when only the count was recorded.
        if "error_rate" in metrics:
            report.append(f"- **Error Rate**: {metrics['error_rate']:.3f}")

    report.append("")

    # Add per-category metrics if available
    if "categories" in metrics:
        report.extend(["## Category Performance", ""])

        for category, category_metrics in metrics["categories"].items():
            # Same zero defaults as the summary section above, so a partially
            # filled category does not abort the whole report.
            report.extend(
                [
                    f"### {category}",
                    "",
                    f"- **Total**: {category_metrics.get('total', 0)}",
                    f"- **Correct**: {category_metrics.get('correct', 0)}",
                    f"- **Accuracy**: {category_metrics.get('accuracy', 0):.3f}",
                ]
            )
            cat_ci = category_metrics.get("accuracy_ci")
            if cat_ci and cat_ci.get("sample_size", 0) > 0:
                report.append(
                    f"- **95% CI**: [{cat_ci['lower']:.3f}, {cat_ci['upper']:.3f}]"
                )
            report.append("")

    # Add configuration info if provided
    if config_info:
        report.extend(["## Configuration", ""])
        report.extend(
            f"- **{key}**: {value}" for key, value in config_info.items()
        )
        report.append("")

    # Add example sections
    report.extend(_format_examples("## Example Correct Answers", correct_examples))
    report.extend(
        _format_examples("## Example Incorrect Answers", incorrect_examples)
    )

    # Add timestamp
    timestamp = datetime.now(UTC).strftime("%Y-%m-%d %H:%M:%S")
    report.extend(
        [
            "## Metadata",
            "",
            f"- **Generated**: {timestamp}",
            f"- **Dataset**: {dataset_name}",
            "",
        ]
    )

    # Write report to file (imported here, as in the original, rather than at
    # module import time)
    from ...security.file_write_verifier import write_file_verified

    content = "\n".join(report)
    write_file_verified(
        output_file,
        content,
        "benchmark.allow_file_output",
        context="benchmark report",
    )

    logger.info(f"Report saved to {output_file}")
    return output_file