Coverage for src/local_deep_research/benchmarks/metrics/reporting.py: 99%
54 statements
coverage.py v7.12.0, created at 2026-01-11 00:51 +0000
1"""
2Report generation for benchmark results.
4This module provides functions for generating detailed reports from benchmark results.
5"""

import json

# import logging - replaced with loguru
from loguru import logger
from datetime import datetime, UTC
from typing import Any, Dict, Optional

# logger = logging.getLogger(__name__) - using loguru logger directly


def generate_report(
    metrics: Dict[str, Any],
    results_file: str,
    output_file: str = "evaluation_report.md",
    dataset_name: str = "Unknown",
    config_info: Optional[Dict[str, Any]] = None,
) -> str:
    """
    Generate a detailed report from evaluation results.

    Args:
        metrics: Dictionary of evaluation metrics
        results_file: Path to results file
        output_file: Path to save report
        dataset_name: Name of dataset
        config_info: Optional configuration information

    Returns:
        Path to the generated report file
    """
    # Load a sample of results for examples
    results = []
    try:
        with open(results_file, "r") as f:
            for line in f:
                if line.strip():  # coverage: condition was always true; the false branch back to the loop was never taken
                    results.append(json.loads(line))
    except Exception:
        logger.exception("Error loading results for report")
        results = []
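
    # The results file is read as JSON Lines (one JSON object per line). If it
    # cannot be read or parsed, the report is still generated, just without
    # the example sections.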

    # Sample up to 5 correct and 5 incorrect examples
    correct_examples = [r for r in results if r.get("is_correct", False)][:5]
    incorrect_examples = [
        r
        for r in results
        if "is_correct" in r and not r.get("is_correct", False)
    ][:5]
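
    # Note: results without an "is_correct" key (e.g. ungraded rows) end up in
    # neither list, so they never appear in the example sections below.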

    # Create report
    report = [
        f"# Evaluation Report: {dataset_name}",
        "",
        "## Summary",
        "",
        f"- **Total Examples**: {metrics.get('total_examples', 0)}",
        f"- **Graded Examples**: {metrics.get('graded_examples', 0)}",
        f"- **Correct Answers**: {metrics.get('correct', 0)}",
        f"- **Accuracy**: {metrics.get('accuracy', 0):.3f}",
    ]
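
    # The remaining summary lines are optional and only added when the
    # corresponding metric is present in the metrics dict.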
    if "average_processing_time" in metrics:
        report.append(
            f"- **Average Processing Time**: {metrics['average_processing_time']:.2f} seconds"
        )

    if "average_confidence" in metrics:
        report.append(
            f"- **Average Confidence**: {metrics['average_confidence']:.2f}%"
        )

    if "error_count" in metrics and metrics["error_count"] > 0:
        report.append(f"- **Error Count**: {metrics['error_count']}")
        report.append(f"- **Error Rate**: {metrics['error_rate']:.3f}")

    report.append("")

    # Add per-category metrics if available
    if "categories" in metrics:
        report.extend(["## Category Performance", ""])

        for category, category_metrics in metrics["categories"].items():
            report.append(f"### {category}")
            report.append("")
            report.append(f"- **Total**: {category_metrics['total']}")
            report.append(f"- **Correct**: {category_metrics['correct']}")
            report.append(f"- **Accuracy**: {category_metrics['accuracy']:.3f}")
            report.append("")
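
    # Note: each category dict above must provide "total", "correct" and
    # "accuracy"; a missing key would raise KeyError here.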

    # Add configuration info if provided
    if config_info:
        report.extend(["## Configuration", ""])

        for key, value in config_info.items():
            report.append(f"- **{key}**: {value}")

        report.append("")
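
    # config_info values are interpolated into the Markdown bullets as-is;
    # no escaping or formatting is applied.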

    # Add example sections
    if correct_examples:
        report.extend(["## Example Correct Answers", ""])

        for idx, example in enumerate(correct_examples):
            report.extend(
                [
                    f"### Example {idx + 1}",
                    "",
                    f"**Question**: {example.get('problem', '')}",
                    "",
                    f"**Correct Answer**: {example.get('correct_answer', '')}",
                    "",
                    f"**Model Answer**: {example.get('extracted_answer', '')}",
                    "",
                    f"**Reasoning**: {example.get('reasoning', '')}",
                    "",
                ]
            )
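
    # The incorrect-answer section below uses the same per-example layout as
    # the correct examples above.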
    if incorrect_examples:
        report.extend(["## Example Incorrect Answers", ""])

        for idx, example in enumerate(incorrect_examples):
            report.extend(
                [
                    f"### Example {idx + 1}",
                    "",
                    f"**Question**: {example.get('problem', '')}",
                    "",
                    f"**Correct Answer**: {example.get('correct_answer', '')}",
                    "",
                    f"**Model Answer**: {example.get('extracted_answer', '')}",
                    "",
                    f"**Reasoning**: {example.get('reasoning', '')}",
                    "",
                ]
            )

    # Add timestamp
    timestamp = datetime.now(UTC).strftime("%Y-%m-%d %H:%M:%S")
    report.extend(
        [
            "## Metadata",
            "",
            f"- **Generated**: {timestamp}",
            f"- **Dataset**: {dataset_name}",
            "",
        ]
    )
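
    # Note: the timestamp above is taken in UTC (datetime.now(UTC)), not local time.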

    # Write report to file
    from ...security.file_write_verifier import write_file_verified

    content = "\n".join(report)
    write_file_verified(
        output_file,
        content,
        "benchmark.allow_file_output",
        context="benchmark report",
    )
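
    # The write is delegated to write_file_verified; judging by its arguments,
    # the "benchmark.allow_file_output" setting presumably gates whether the
    # report may be written to disk.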

    logger.info(f"Report saved to {output_file}")
    return output_file
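

# --- Illustrative usage sketch (not part of the coverage-measured module) ---
# The metrics dict, file names and config values below are hypothetical.
# generate_report reads its metrics keys with .get() and falls back to
# defaults, so any subset of the keys shown in the function above works.
#
#     from local_deep_research.benchmarks.metrics.reporting import generate_report
#
#     metrics = {
#         "total_examples": 100,
#         "graded_examples": 98,
#         "correct": 73,
#         "accuracy": 73 / 98,
#         "average_processing_time": 12.4,
#     }
#     report_path = generate_report(
#         metrics,
#         results_file="results.jsonl",  # JSON Lines produced by the benchmark run
#         output_file="evaluation_report.md",
#         dataset_name="ExampleDataset",
#         config_info={"model": "example-model", "iterations": 2},
#     )
#     print(report_path)  # -> "evaluation_report.md"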