Coverage for src/local_deep_research/benchmarks/metrics/reporting.py: 94%
60 statements
« prev ^ index » next coverage.py v7.14.1, created at 2026-06-03 23:15 +0000
« prev ^ index » next coverage.py v7.14.1, created at 2026-06-03 23:15 +0000
1"""
2Report generation for benchmark results.
4This module provides functions for generating detailed reports from benchmark results.
5"""
7import json
9# import logging - replaced with loguru
10from loguru import logger
11from datetime import datetime, UTC
12from typing import Any, Dict, Optional
14# logger = logging.getLogger(__name__) - using loguru logger directly
def _load_example_samples(
    results_file: str, limit: int = 5
) -> tuple[list[Dict[str, Any]], list[Dict[str, Any]]]:
    """
    Collect up to ``limit`` correct and ``limit`` incorrect result records.

    The results file is read as JSON Lines (one JSON document per non-blank
    line). Records are sampled while streaming so the whole file is never
    held in memory, but every line is still parsed.

    Args:
        results_file: Path to the JSON Lines results file
        limit: Maximum number of examples to keep for each group

    Returns:
        Tuple of (correct examples, incorrect examples), in file order.
        Both lists are empty if the file cannot be read or any line fails
        to parse; the error is logged rather than raised.
    """
    correct_examples: list[Dict[str, Any]] = []
    incorrect_examples: list[Dict[str, Any]] = []

    try:
        with open(results_file, "r", encoding="utf-8") as f:
            for line in f:
                if not line.strip():
                    continue

                record = json.loads(line)

                # A line may hold valid JSON that is not an object (list,
                # string, number). Such values have no .get(), so skip them
                # instead of crashing later while building the report.
                if not isinstance(record, dict):
                    continue

                if record.get("is_correct", False):
                    if len(correct_examples) < limit:
                        correct_examples.append(record)
                elif "is_correct" in record:
                    # Only explicitly graded records count as incorrect;
                    # records without an "is_correct" key are ungraded.
                    if len(incorrect_examples) < limit:
                        incorrect_examples.append(record)
    except Exception:
        # Best effort: the report is still produced, just without examples.
        logger.exception("Error loading results for report")
        return [], []

    return correct_examples, incorrect_examples


def _format_example_section(
    heading: str, examples: list[Dict[str, Any]]
) -> list[str]:
    """
    Render one "examples" section of the report as Markdown lines.

    Args:
        heading: Markdown heading line for the section
        examples: Result records to render

    Returns:
        List of report lines; empty when there are no examples, so the
        heading is omitted entirely.
    """
    if not examples:
        return []

    lines = [heading, ""]
    for number, example in enumerate(examples, start=1):
        lines.extend(
            [
                f"### Example {number}",
                "",
                f"**Question**: {example.get('problem', '')}",
                "",
                f"**Correct Answer**: {example.get('correct_answer', '')}",
                "",
                f"**Model Answer**: {example.get('extracted_answer', '')}",
                "",
                f"**Reasoning**: {example.get('reasoning', '')}",
                "",
            ]
        )
    return lines


def generate_report(
    metrics: Dict[str, Any],
    results_file: str,
    output_file: str = "evaluation_report.md",
    dataset_name: str = "Unknown",
    config_info: Optional[Dict[str, Any]] = None,
) -> str:
    """
    Generate a detailed Markdown report from evaluation results.

    The report contains a summary, optional per-category metrics, optional
    configuration info, up to 5 correct and 5 incorrect example answers
    sampled from the results file, and a metadata footer with a UTC
    timestamp.

    Args:
        metrics: Dictionary of evaluation metrics. Summary values are read
            with defaults; "accuracy_ci", "average_processing_time",
            "average_confidence", "error_count"/"error_rate" and
            "categories" are optional.
        results_file: Path to results file (JSON Lines). Failure to load it
            is logged and only removes the example sections.
        output_file: Path to save report
        dataset_name: Name of dataset
        config_info: Optional configuration information

    Returns:
        Path to the generated report file

    Raises:
        KeyError: If an entry of metrics["categories"] lacks "total",
            "correct" or "accuracy".
    """
    # Sample up to 5 correct and 5 incorrect examples
    correct_examples, incorrect_examples = _load_example_samples(results_file)

    # Create report
    report = [
        f"# Evaluation Report: {dataset_name}",
        "",
        "## Summary",
        "",
        f"- **Total Examples**: {metrics.get('total_examples', 0)}",
        f"- **Graded Examples**: {metrics.get('graded_examples', 0)}",
        f"- **Correct Answers**: {metrics.get('correct', 0)}",
        f"- **Accuracy**: {metrics.get('accuracy', 0):.3f}",
    ]

    # Add confidence interval if available
    accuracy_ci = metrics.get("accuracy_ci")
    if accuracy_ci and accuracy_ci.get("sample_size", 0) > 0:
        report.append(
            f"- **95% CI**: [{accuracy_ci['lower']:.3f}, {accuracy_ci['upper']:.3f}] "
            f"(Wilson score, n={accuracy_ci['sample_size']})"
        )

    if "average_processing_time" in metrics:
        report.append(
            f"- **Average Processing Time**: {metrics['average_processing_time']:.2f} seconds"
        )

    if "average_confidence" in metrics:
        report.append(
            f"- **Average Confidence**: {metrics['average_confidence']:.2f}%"
        )

    if "error_count" in metrics and metrics["error_count"] > 0:
        report.append(f"- **Error Count**: {metrics['error_count']}")
        # The rate is only reported when the caller supplied it; a missing
        # "error_rate" must not abort the whole report with a KeyError.
        if "error_rate" in metrics:
            report.append(f"- **Error Rate**: {metrics['error_rate']:.3f}")

    report.append("")

    # Add per-category metrics if available
    if "categories" in metrics:
        report.extend(["## Category Performance", ""])

        for category, category_metrics in metrics["categories"].items():
            report.append(f"### {category}")
            report.append("")
            report.append(f"- **Total**: {category_metrics['total']}")
            report.append(f"- **Correct**: {category_metrics['correct']}")
            report.append(f"- **Accuracy**: {category_metrics['accuracy']:.3f}")
            cat_ci = category_metrics.get("accuracy_ci")
            if cat_ci and cat_ci.get("sample_size", 0) > 0:
                report.append(
                    f"- **95% CI**: [{cat_ci['lower']:.3f}, {cat_ci['upper']:.3f}]"
                )
            report.append("")

    # Add configuration info if provided
    if config_info:
        report.extend(["## Configuration", ""])

        for key, value in config_info.items():
            report.append(f"- **{key}**: {value}")

        report.append("")

    # Add example sections (each is omitted when it has no examples)
    report.extend(
        _format_example_section("## Example Correct Answers", correct_examples)
    )
    report.extend(
        _format_example_section(
            "## Example Incorrect Answers", incorrect_examples
        )
    )

    # Add timestamp
    timestamp = datetime.now(UTC).strftime("%Y-%m-%d %H:%M:%S")
    report.extend(
        [
            "## Metadata",
            "",
            f"- **Generated**: {timestamp}",
            f"- **Dataset**: {dataset_name}",
            "",
        ]
    )

    # Write report to file. The import stays local to the function, as in
    # the original code.
    # NOTE(review): write_file_verified presumably gates the write on the
    # "benchmark.allow_file_output" setting - confirm against the helper.
    from ...security.file_write_verifier import write_file_verified

    content = "\n".join(report)
    write_file_verified(
        output_file,
        content,
        "benchmark.allow_file_output",
        context="benchmark report",
    )

    logger.info(f"Report saved to {output_file}")
    return output_file