Coverage for src/local_deep_research/benchmarks/metrics/reporting.py: 99%

54 statements  

coverage.py v7.12.0, created at 2026-01-11 00:51 +0000

1""" 

2Report generation for benchmark results. 

3 

4This module provides functions for generating detailed reports from benchmark results. 

5""" 

6 

7import json 

8 

9# import logging - replaced with loguru 

10from loguru import logger 

11from datetime import datetime, UTC 

12from typing import Any, Dict, Optional 

13 

14# logger = logging.getLogger(__name__) - using loguru logger directly 

15 

16 

17def generate_report( 

18 metrics: Dict[str, Any], 

19 results_file: str, 

20 output_file: str = "evaluation_report.md", 

21 dataset_name: str = "Unknown", 

22 config_info: Optional[Dict[str, Any]] = None, 

23) -> str: 

24 """ 

25 Generate a detailed report from evaluation results. 

26 

27 Args: 

28 metrics: Dictionary of evaluation metrics 

29 results_file: Path to results file 

30 output_file: Path to save report 

31 dataset_name: Name of dataset 

32 config_info: Optional configuration information 

33 

34 Returns: 

35 Path to the generated report file 

36 """ 

37 # Load a sample of results for examples 

38 results = [] 

39 try: 

40 with open(results_file, "r") as f: 

41 for line in f: 

42 if line.strip(): 42 ↛ 41line 42 didn't jump to line 41 because the condition on line 42 was always true

43 results.append(json.loads(line)) 

44 except Exception: 

45 logger.exception("Error loading results for report") 

46 results = [] 

47 

48 # Sample up to 5 correct and 5 incorrect examples 

49 correct_examples = [r for r in results if r.get("is_correct", False)][:5] 

50 incorrect_examples = [ 

51 r 

52 for r in results 

53 if "is_correct" in r and not r.get("is_correct", False) 

54 ][:5] 

55 

56 # Create report 

57 report = [ 

58 f"# Evaluation Report: {dataset_name}", 

59 "", 

60 "## Summary", 

61 "", 

62 f"- **Total Examples**: {metrics.get('total_examples', 0)}", 

63 f"- **Graded Examples**: {metrics.get('graded_examples', 0)}", 

64 f"- **Correct Answers**: {metrics.get('correct', 0)}", 

65 f"- **Accuracy**: {metrics.get('accuracy', 0):.3f}", 

66 ] 

67 

68 if "average_processing_time" in metrics: 

69 report.append( 

70 f"- **Average Processing Time**: {metrics['average_processing_time']:.2f} seconds" 

71 ) 

72 

73 if "average_confidence" in metrics: 

74 report.append( 

75 f"- **Average Confidence**: {metrics['average_confidence']:.2f}%" 

76 ) 

77 

78 if "error_count" in metrics and metrics["error_count"] > 0: 

79 report.append(f"- **Error Count**: {metrics['error_count']}") 

80 report.append(f"- **Error Rate**: {metrics['error_rate']:.3f}") 

81 

82 report.append("") 

83 

84 # Add per-category metrics if available 

85 if "categories" in metrics: 

86 report.extend(["## Category Performance", ""]) 

87 

88 for category, category_metrics in metrics["categories"].items(): 

89 report.append(f"### {category}") 

90 report.append("") 

91 report.append(f"- **Total**: {category_metrics['total']}") 

92 report.append(f"- **Correct**: {category_metrics['correct']}") 

93 report.append(f"- **Accuracy**: {category_metrics['accuracy']:.3f}") 

94 report.append("") 

95 

96 # Add configuration info if provided 

97 if config_info: 

98 report.extend(["## Configuration", ""]) 

99 

100 for key, value in config_info.items(): 

101 report.append(f"- **{key}**: {value}") 

102 

103 report.append("") 

104 

105 # Add example sections 

106 if correct_examples: 

107 report.extend(["## Example Correct Answers", ""]) 

108 

109 for idx, example in enumerate(correct_examples): 

110 report.extend( 

111 [ 

112 f"### Example {idx + 1}", 

113 "", 

114 f"**Question**: {example.get('problem', '')}", 

115 "", 

116 f"**Correct Answer**: {example.get('correct_answer', '')}", 

117 "", 

118 f"**Model Answer**: {example.get('extracted_answer', '')}", 

119 "", 

120 f"**Reasoning**: {example.get('reasoning', '')}", 

121 "", 

122 ] 

123 ) 

124 

125 if incorrect_examples: 

126 report.extend(["## Example Incorrect Answers", ""]) 

127 

128 for idx, example in enumerate(incorrect_examples): 

129 report.extend( 

130 [ 

131 f"### Example {idx + 1}", 

132 "", 

133 f"**Question**: {example.get('problem', '')}", 

134 "", 

135 f"**Correct Answer**: {example.get('correct_answer', '')}", 

136 "", 

137 f"**Model Answer**: {example.get('extracted_answer', '')}", 

138 "", 

139 f"**Reasoning**: {example.get('reasoning', '')}", 

140 "", 

141 ] 

142 ) 

143 

144 # Add timestamp 

145 timestamp = datetime.now(UTC).strftime("%Y-%m-%d %H:%M:%S") 

146 report.extend( 

147 [ 

148 "## Metadata", 

149 "", 

150 f"- **Generated**: {timestamp}", 

151 f"- **Dataset**: {dataset_name}", 

152 "", 

153 ] 

154 ) 

155 

156 # Write report to file 

157 from ...security.file_write_verifier import write_file_verified 

158 

159 content = "\n".join(report) 

160 write_file_verified( 

161 output_file, 

162 content, 

163 "benchmark.allow_file_output", 

164 context="benchmark report", 

165 ) 

166 

167 logger.info(f"Report saved to {output_file}") 

168 return output_file
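
Usage sketch: the snippet below shows one plausible way to call generate_report, assuming the module is importable as local_deep_research.benchmarks.metrics.reporting (inferred from the file path above) and that the benchmark.allow_file_output setting checked by write_file_verified permits writing the report. The results file is JSON Lines with the fields the function actually reads (problem, correct_answer, extracted_answer, reasoning, is_correct), and the metrics keys mirror the Summary section; all concrete values and file names here are made up for illustration.

    import json

    # Hypothetical import path, inferred from the covered file's location.
    from local_deep_research.benchmarks.metrics.reporting import generate_report

    # Write a small results file: one JSON object per line, using the fields
    # generate_report samples for its example sections.
    records = [
        {
            "problem": "What is 2 + 2?",
            "correct_answer": "4",
            "extracted_answer": "4",
            "reasoning": "Simple arithmetic.",
            "is_correct": True,
        },
        {
            "problem": "What is the capital of France?",
            "correct_answer": "Paris",
            "extracted_answer": "Lyon",
            "reasoning": "Guessed a large French city.",
            "is_correct": False,
        },
    ]
    with open("results.jsonl", "w") as f:
        for record in records:
            f.write(json.dumps(record) + "\n")

    # Metrics keys match what the Summary section of the report expects.
    metrics = {
        "total_examples": 2,
        "graded_examples": 2,
        "correct": 1,
        "accuracy": 0.5,
    }

    report_path = generate_report(
        metrics,
        results_file="results.jsonl",
        output_file="evaluation_report.md",
        dataset_name="demo",
        config_info={"model": "example-model"},
    )
    print(report_path)  # path to the generated Markdown report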