Coverage for src/local_deep_research/benchmarks/metrics/reporting.py: 99%

1"""

2Report generation for benchmark results.

4This module provides functions for generating detailed reports from benchmark results.

5"""

7import json

9# import logging - replaced with loguru

10from loguru import logger

11from datetime import datetime, UTC

12from typing import Any, Dict, Optional

14# logger = logging.getLogger(__name__) - using loguru logger directly

def _format_example_section(
    heading: str, examples: list[Dict[str, Any]]
) -> list[str]:
    """Render a titled Markdown section for a list of graded examples.

    Args:
        heading: Markdown heading line for the section.
        examples: Result records to render; missing fields render as "".

    Returns:
        The section's lines, or an empty list when there are no examples.
    """
    if not examples:
        return []

    lines = [heading, ""]
    for idx, example in enumerate(examples, start=1):
        lines.extend(
            [
                f"### Example {idx}",
                "",
                f"**Question**: {example.get('problem', '')}",
                "",
                f"**Correct Answer**: {example.get('correct_answer', '')}",
                "",
                f"**Model Answer**: {example.get('extracted_answer', '')}",
                "",
                f"**Reasoning**: {example.get('reasoning', '')}",
                "",
            ]
        )
    return lines


def generate_report(
    metrics: Dict[str, Any],
    results_file: str,
    output_file: str = "evaluation_report.md",
    dataset_name: str = "Unknown",
    config_info: Optional[Dict[str, Any]] = None,
) -> str:
    """
    Generate a detailed Markdown report from evaluation results.

    The report contains a summary, optional per-category and configuration
    sections, up to five correct and five incorrect examples read from
    ``results_file``, and a metadata footer with a UTC timestamp.

    Args:
        metrics: Dictionary of evaluation metrics
        results_file: Path to a results file with one JSON object per line
        output_file: Path to save report
        dataset_name: Name of dataset
        config_info: Optional configuration information

    Returns:
        Path to the generated report file (``output_file`` unchanged)

    Note:
        Failures while reading ``results_file`` are logged and the report is
        generated without example sections. Errors raised by
        ``write_file_verified`` are not caught here.
    """
    # Load results for examples. JSON text is UTF-8, so decode explicitly
    # instead of relying on the platform's locale encoding.
    results = []
    try:
        with open(results_file, "r", encoding="utf-8") as f:
            for line in f:
                if not line.strip():
                    continue
                record = json.loads(line)
                # Only JSON objects can be graded examples; anything else
                # would break the .get() lookups below.
                if isinstance(record, dict):
                    results.append(record)
    except Exception:
        # Best effort: the report is still useful without examples.
        logger.exception("Error loading results for report")
        results = []

    # Sample up to 5 correct and 5 incorrect examples. Records without an
    # "is_correct" key are ungraded and belong to neither group.
    correct_examples = [r for r in results if r.get("is_correct", False)][:5]
    incorrect_examples = [
        r
        for r in results
        if "is_correct" in r and not r.get("is_correct", False)
    ][:5]

    # Create report
    report = [
        f"# Evaluation Report: {dataset_name}",
        "",
        "## Summary",
        "",
        f"- **Total Examples**: {metrics.get('total_examples', 0)}",
        f"- **Graded Examples**: {metrics.get('graded_examples', 0)}",
        f"- **Correct Answers**: {metrics.get('correct', 0)}",
        f"- **Accuracy**: {metrics.get('accuracy', 0):.3f}",
    ]

    if "average_processing_time" in metrics:
        report.append(
            f"- **Average Processing Time**: {metrics['average_processing_time']:.2f} seconds"
        )

    if "average_confidence" in metrics:
        report.append(
            f"- **Average Confidence**: {metrics['average_confidence']:.2f}%"
        )

    if "error_count" in metrics and metrics["error_count"] > 0:
        report.append(f"- **Error Count**: {metrics['error_count']}")
        # error_rate is optional: its absence must not abort the report.
        if "error_rate" in metrics:
            report.append(f"- **Error Rate**: {metrics['error_rate']:.3f}")

    report.append("")

    # Add per-category metrics if available
    if "categories" in metrics:
        report.extend(["## Category Performance", ""])

        for category, category_metrics in metrics["categories"].items():
            # Default missing values to 0, matching the summary section.
            report.extend(
                [
                    f"### {category}",
                    "",
                    f"- **Total**: {category_metrics.get('total', 0)}",
                    f"- **Correct**: {category_metrics.get('correct', 0)}",
                    f"- **Accuracy**: {category_metrics.get('accuracy', 0):.3f}",
                    "",
                ]
            )

    # Add configuration info if provided
    if config_info:
        report.extend(["## Configuration", ""])
        report.extend(
            f"- **{key}**: {value}" for key, value in config_info.items()
        )
        report.append("")

    # Add example sections (each is omitted entirely when empty)
    report.extend(
        _format_example_section("## Example Correct Answers", correct_examples)
    )
    report.extend(
        _format_example_section(
            "## Example Incorrect Answers", incorrect_examples
        )
    )

    # Add timestamp
    timestamp = datetime.now(UTC).strftime("%Y-%m-%d %H:%M:%S")
    report.extend(
        [
            "## Metadata",
            "",
            f"- **Generated**: {timestamp}",
            f"- **Dataset**: {dataset_name}",
            "",
        ]
    )

    # Write report to file.
    # NOTE(review): kept as a function-level import as in the original;
    # presumably this avoids an import cycle - confirm before hoisting.
    from ...security.file_write_verifier import write_file_verified

    content = "\n".join(report)
    write_file_verified(
        output_file,
        content,
        "benchmark.allow_file_output",
        context="benchmark report",
    )

    logger.info(f"Report saved to {output_file}")
    return output_file

Coverage for src / local_deep_research / benchmarks / metrics / reporting.py: 99%

54 statements