# Coverage for src/local_deep_research/database/models/benchmark.py: 100%
# 101 statements
# coverage.py v7.12.0, created at 2026-01-11 00:51 +0000
"""Database models for benchmark system."""
import enum

from sqlalchemy import (
    JSON,
    Boolean,
    Column,
    Enum,
    Float,
    ForeignKey,
    Index,
    Integer,
    String,
    Text,
    UniqueConstraint,
)
from sqlalchemy.orm import relationship
from sqlalchemy_utc import UtcDateTime, utcnow

# Use the same base as the main app
from . import Base
class BenchmarkStatus(enum.Enum):
    """Status of a benchmark run.

    Used as the enum type of the ``status`` column on ``BenchmarkRun``,
    where ``PENDING`` is the column default.
    """

    PENDING = "pending"
    IN_PROGRESS = "in_progress"
    COMPLETED = "completed"
    FAILED = "failed"
    CANCELLED = "cancelled"
    PAUSED = "paused"
class DatasetType(enum.Enum):
    """Supported dataset types.

    Used as the enum type of ``BenchmarkResult.dataset_type`` and
    ``BenchmarkProgress.current_dataset``.
    """

    SIMPLEQA = "simpleqa"
    BROWSECOMP = "browsecomp"
    XBENCH_DEEPSEARCH = "xbench_deepsearch"
    CUSTOM = "custom"
class BenchmarkRun(Base):
    """Main benchmark run metadata.

    One row per benchmark run, stored in ``benchmark_runs``. Holds the run's
    configuration (as JSON), its status and timing, progress counters and
    summary results. Related ``BenchmarkResult`` and ``BenchmarkProgress``
    rows are reachable through ``results`` and ``progress_updates``.
    """

    __tablename__ = "benchmark_runs"

    # NOTE(review): index=True on a primary key looks redundant - confirm.
    id = Column(Integer, primary_key=True, index=True)

    # Run identification
    run_name = Column(String(255), nullable=True)  # User-friendly name
    # NOTE(review): config_hash is also indexed explicitly by
    # idx_benchmark_runs_config_hash in __table_args__, so it appears to get
    # two indexes - confirm before removing either one.
    config_hash = Column(
        String(16), nullable=False, index=True
    )  # For compatibility matching
    query_hash_list = Column(
        JSON, nullable=False
    )  # List of query hashes to avoid duplication

    # Configuration
    search_config = Column(
        JSON, nullable=False
    )  # Complete search configuration
    evaluation_config = Column(JSON, nullable=False)  # Evaluation settings
    datasets_config = Column(
        JSON, nullable=False
    )  # Dataset selection and quantities

    # Status and timing
    status = Column(
        Enum(BenchmarkStatus), default=BenchmarkStatus.PENDING, nullable=False
    )
    # NOTE(review): utcnow() is invoked once, at class-definition time; this
    # presumably yields a SQL expression evaluated per INSERT/UPDATE rather
    # than a fixed timestamp - confirm against sqlalchemy_utc.
    created_at = Column(UtcDateTime, default=utcnow(), nullable=False)
    updated_at = Column(
        UtcDateTime, default=utcnow(), onupdate=utcnow(), nullable=False
    )
    start_time = Column(UtcDateTime, nullable=True)
    end_time = Column(UtcDateTime, nullable=True)

    # Progress tracking
    total_examples = Column(Integer, default=0, nullable=False)
    completed_examples = Column(Integer, default=0, nullable=False)
    failed_examples = Column(Integer, default=0, nullable=False)

    # Results summary
    overall_accuracy = Column(Float, nullable=True)
    processing_rate = Column(Float, nullable=True)  # Examples per minute

    # Error handling
    error_message = Column(Text, nullable=True)

    # Relationships (both configured with the "all, delete-orphan" cascade
    # and dynamic loading; each is the counterpart of the child's
    # ``benchmark_run`` attribute)
    results = relationship(
        "BenchmarkResult",
        back_populates="benchmark_run",
        cascade="all, delete-orphan",
        lazy="dynamic",
    )
    progress_updates = relationship(
        "BenchmarkProgress",
        back_populates="benchmark_run",
        cascade="all, delete-orphan",
        lazy="dynamic",
    )

    # Indexes for performance and extend existing
    __table_args__ = (
        Index("idx_benchmark_runs_config_hash", "config_hash"),
        Index("idx_benchmark_runs_status_created", "status", "created_at"),
        {"extend_existing": True},
    )
class BenchmarkResult(Base):
    """Individual benchmark result for a single question.

    Stored in ``benchmark_results``. Each row belongs to one
    ``BenchmarkRun`` (via ``benchmark_run_id``) and records the question,
    the expected answer, the research output, the grading outcome and any
    errors. The ``uix_run_query`` constraint allows at most one row per
    ``(benchmark_run_id, query_hash)`` pair.
    """

    __tablename__ = "benchmark_results"

    # NOTE(review): index=True on a primary key looks redundant - confirm.
    id = Column(Integer, primary_key=True, index=True)

    # Foreign key (declared with ON DELETE CASCADE to benchmark_runs.id)
    benchmark_run_id = Column(
        Integer,
        ForeignKey("benchmark_runs.id", ondelete="CASCADE"),
        nullable=False,
        index=True,
    )

    # Question identification
    example_id = Column(String(255), nullable=False)  # Original dataset ID
    # NOTE(review): query_hash is also indexed explicitly by
    # idx_benchmark_results_query_hash in __table_args__, so it appears to
    # get two indexes - confirm before removing either one.
    query_hash = Column(
        String(32), nullable=False, index=True
    )  # For deduplication
    dataset_type = Column(Enum(DatasetType), nullable=False)
    research_id = Column(
        String(36), nullable=True, index=True
    )  # UUID string or converted integer

    # Question and answer
    question = Column(Text, nullable=False)
    correct_answer = Column(Text, nullable=False)

    # Research results
    response = Column(Text, nullable=True)
    extracted_answer = Column(Text, nullable=True)
    confidence = Column(String(10), nullable=True)
    processing_time = Column(Float, nullable=True)
    sources = Column(JSON, nullable=True)

    # Evaluation results
    is_correct = Column(Boolean, nullable=True)
    graded_confidence = Column(String(10), nullable=True)
    grader_response = Column(Text, nullable=True)

    # Timestamps
    # NOTE(review): utcnow() is invoked once, at class-definition time; this
    # presumably yields a SQL expression evaluated per INSERT rather than a
    # fixed timestamp - confirm against sqlalchemy_utc.
    created_at = Column(UtcDateTime, default=utcnow(), nullable=False)
    completed_at = Column(UtcDateTime, nullable=True)

    # Error handling
    research_error = Column(Text, nullable=True)
    evaluation_error = Column(Text, nullable=True)

    # Additional metadata
    task_index = Column(Integer, nullable=True)  # Order in processing
    result_metadata = Column(JSON, nullable=True)  # Additional data

    # Relationships (counterpart of BenchmarkRun.results)
    benchmark_run = relationship("BenchmarkRun", back_populates="results")

    # Indexes for performance
    __table_args__ = (
        Index(
            "idx_benchmark_results_run_dataset",
            "benchmark_run_id",
            "dataset_type",
        ),
        Index("idx_benchmark_results_query_hash", "query_hash"),
        Index("idx_benchmark_results_completed", "completed_at"),
        # At most one result per query within a given run.
        UniqueConstraint(
            "benchmark_run_id", "query_hash", name="uix_run_query"
        ),
        {"extend_existing": True},
    )
class BenchmarkConfig(Base):
    """Saved benchmark configurations for reuse.

    Stored in ``benchmark_configs``. Each row carries a named configuration
    (search, evaluation and dataset settings as JSON) together with
    default/public flags, usage counters and optional performance figures.
    """

    __tablename__ = "benchmark_configs"

    # NOTE(review): index=True on a primary key looks redundant - confirm.
    id = Column(Integer, primary_key=True, index=True)

    # Configuration details
    name = Column(String(255), nullable=False)
    description = Column(Text, nullable=True)
    # NOTE(review): config_hash is also indexed explicitly by
    # idx_benchmark_configs_hash in __table_args__, so it appears to get two
    # indexes - confirm before removing either one.
    config_hash = Column(String(16), nullable=False, index=True)

    # Configuration data
    search_config = Column(JSON, nullable=False)
    evaluation_config = Column(JSON, nullable=False)
    datasets_config = Column(JSON, nullable=False)

    # Metadata
    # NOTE(review): utcnow() is invoked once, at class-definition time; this
    # presumably yields a SQL expression evaluated per INSERT/UPDATE rather
    # than a fixed timestamp - confirm against sqlalchemy_utc.
    created_at = Column(UtcDateTime, default=utcnow(), nullable=False)
    updated_at = Column(
        UtcDateTime, default=utcnow(), onupdate=utcnow(), nullable=False
    )
    is_default = Column(Boolean, default=False, nullable=False)
    is_public = Column(Boolean, default=True, nullable=False)

    # Usage tracking
    usage_count = Column(Integer, default=0, nullable=False)
    last_used = Column(UtcDateTime, nullable=True)

    # Performance data (if available)
    best_accuracy = Column(Float, nullable=True)
    avg_processing_rate = Column(Float, nullable=True)

    # Indexes
    __table_args__ = (
        Index("idx_benchmark_configs_name", "name"),
        Index("idx_benchmark_configs_hash", "config_hash"),
        Index("idx_benchmark_configs_default", "is_default"),
        {"extend_existing": True},
    )
class BenchmarkProgress(Base):
    """Real-time progress tracking for benchmark runs.

    Stored in ``benchmark_progress``. Each row is a timestamped snapshot
    belonging to one ``BenchmarkRun`` (via ``benchmark_run_id``) with
    example counts, accuracy figures, rate/ETA and resource-usage metrics.
    """

    __tablename__ = "benchmark_progress"

    # NOTE(review): index=True on a primary key looks redundant - confirm.
    id = Column(Integer, primary_key=True, index=True)

    # Foreign key (declared with ON DELETE CASCADE to benchmark_runs.id)
    # NOTE(review): benchmark_run_id is also the leading column of
    # idx_benchmark_progress_run_time below - the single-column index may be
    # unnecessary; confirm.
    benchmark_run_id = Column(
        Integer,
        ForeignKey("benchmark_runs.id", ondelete="CASCADE"),
        nullable=False,
        index=True,
    )

    # Progress data
    # NOTE(review): utcnow() is invoked once, at class-definition time; this
    # presumably yields a SQL expression evaluated per INSERT rather than a
    # fixed timestamp - confirm against sqlalchemy_utc.
    timestamp = Column(UtcDateTime, default=utcnow(), nullable=False)
    completed_examples = Column(Integer, nullable=False)
    total_examples = Column(Integer, nullable=False)

    # Accuracy tracking
    overall_accuracy = Column(Float, nullable=True)
    dataset_accuracies = Column(JSON, nullable=True)  # Per-dataset accuracy

    # Performance metrics
    processing_rate = Column(Float, nullable=True)  # Examples per minute
    estimated_completion = Column(UtcDateTime, nullable=True)

    # Current status
    current_dataset = Column(Enum(DatasetType), nullable=True)
    current_example_id = Column(String(255), nullable=True)

    # Additional metrics
    memory_usage = Column(Float, nullable=True)  # MB
    cpu_usage = Column(Float, nullable=True)  # Percentage

    # Relationships (counterpart of BenchmarkRun.progress_updates)
    benchmark_run = relationship(
        "BenchmarkRun", back_populates="progress_updates"
    )

    # Indexes for real-time queries
    __table_args__ = (
        Index(
            "idx_benchmark_progress_run_time", "benchmark_run_id", "timestamp"
        ),
        {"extend_existing": True},
    )