Coverage for src / local_deep_research / database / models / benchmark.py: 100%

101 statements  

« prev     ^ index     » next       coverage.py v7.12.0, created at 2026-01-11 00:51 +0000

1"""Database models for benchmark system.""" 

2 

3import enum 

4 

5from sqlalchemy import ( 

6 JSON, 

7 Boolean, 

8 Column, 

9 Enum, 

10 Float, 

11 ForeignKey, 

12 Index, 

13 Integer, 

14 String, 

15 Text, 

16 UniqueConstraint, 

17) 

18from sqlalchemy.orm import relationship 

19from sqlalchemy_utc import UtcDateTime, utcnow 

20 

21 

22# Use the same base as the main app 

23from . import Base 

24 

25 

class BenchmarkStatus(enum.Enum):
    """Enumerates the states a benchmark run can report.

    Every member's value is the lowercase spelling of its name.
    """

    PENDING = "pending"
    IN_PROGRESS = "in_progress"
    COMPLETED = "completed"
    FAILED = "failed"
    CANCELLED = "cancelled"
    PAUSED = "paused"

35 

36 

class DatasetType(enum.Enum):
    """Enumerates the dataset kinds a benchmark can be run against.

    Every member's value is the lowercase spelling of its name.
    """

    SIMPLEQA = "simpleqa"
    BROWSECOMP = "browsecomp"
    XBENCH_DEEPSEARCH = "xbench_deepsearch"
    CUSTOM = "custom"

44 

45 

class BenchmarkRun(Base):
    """Metadata row for one benchmark run.

    Holds the run's identification hashes, the JSON configuration it was
    created with, status and timing columns, progress counters and summary
    metrics.  Child ``BenchmarkResult`` and ``BenchmarkProgress`` rows are
    reachable through ``results`` and ``progress_updates``.
    """

    __tablename__ = "benchmark_runs"

    id = Column(Integer, primary_key=True, index=True)

    # Run identification
    run_name = Column(String(255), nullable=True)  # User-friendly name
    # For compatibility matching.  Indexed exactly once, by the named
    # ``idx_benchmark_runs_config_hash`` entry in ``__table_args__``; a
    # column-level ``index=True`` would emit a second, identical index.
    config_hash = Column(String(16), nullable=False)
    # List of query hashes to avoid duplication
    query_hash_list = Column(JSON, nullable=False)

    # Configuration
    search_config = Column(JSON, nullable=False)  # Complete search configuration
    evaluation_config = Column(JSON, nullable=False)  # Evaluation settings
    # Dataset selection and quantities
    datasets_config = Column(JSON, nullable=False)

    # Status and timing
    status = Column(
        Enum(BenchmarkStatus), default=BenchmarkStatus.PENDING, nullable=False
    )
    # ``utcnow()`` builds a SQL expression (see sqlalchemy_utc), so the
    # timestamp is produced per statement, not once at import time.
    created_at = Column(UtcDateTime, default=utcnow(), nullable=False)
    updated_at = Column(
        UtcDateTime, default=utcnow(), onupdate=utcnow(), nullable=False
    )
    start_time = Column(UtcDateTime, nullable=True)
    end_time = Column(UtcDateTime, nullable=True)

    # Progress tracking
    total_examples = Column(Integer, default=0, nullable=False)
    completed_examples = Column(Integer, default=0, nullable=False)
    failed_examples = Column(Integer, default=0, nullable=False)

    # Results summary
    overall_accuracy = Column(Float, nullable=True)
    processing_rate = Column(Float, nullable=True)  # Examples per minute

    # Error handling
    error_message = Column(Text, nullable=True)

    # Relationships.  ``delete-orphan`` cascades removal of a run to its
    # children; ``lazy="dynamic"`` exposes each collection as a query object
    # instead of loading every row on access.
    results = relationship(
        "BenchmarkResult",
        back_populates="benchmark_run",
        cascade="all, delete-orphan",
        lazy="dynamic",
    )
    progress_updates = relationship(
        "BenchmarkProgress",
        back_populates="benchmark_run",
        cascade="all, delete-orphan",
        lazy="dynamic",
    )

    # Indexes for performance and extend existing
    __table_args__ = (
        Index("idx_benchmark_runs_config_hash", "config_hash"),
        Index("idx_benchmark_runs_status_created", "status", "created_at"),
        {"extend_existing": True},
    )

114 

115 

class BenchmarkResult(Base):
    """Result row for a single benchmark question within a run.

    Each row belongs to one ``BenchmarkRun`` (``benchmark_run_id``) and stores
    the question, the expected answer, the research output, the grading
    outcome and any error text.  ``uix_run_query`` keeps ``query_hash`` unique
    within a run.
    """

    __tablename__ = "benchmark_results"

    id = Column(Integer, primary_key=True, index=True)

    # Foreign key; the database removes results when the parent run is deleted.
    benchmark_run_id = Column(
        Integer,
        ForeignKey("benchmark_runs.id", ondelete="CASCADE"),
        nullable=False,
        index=True,
    )

    # Question identification
    example_id = Column(String(255), nullable=False)  # Original dataset ID
    # For deduplication.  Indexed exactly once, by the named
    # ``idx_benchmark_results_query_hash`` entry in ``__table_args__``; a
    # column-level ``index=True`` would emit a second, identical index.
    query_hash = Column(String(32), nullable=False)
    dataset_type = Column(Enum(DatasetType), nullable=False)
    # UUID string or converted integer
    research_id = Column(String(36), nullable=True, index=True)

    # Question and answer
    question = Column(Text, nullable=False)
    correct_answer = Column(Text, nullable=False)

    # Research results
    response = Column(Text, nullable=True)
    extracted_answer = Column(Text, nullable=True)
    confidence = Column(String(10), nullable=True)
    processing_time = Column(Float, nullable=True)
    sources = Column(JSON, nullable=True)

    # Evaluation results
    is_correct = Column(Boolean, nullable=True)
    graded_confidence = Column(String(10), nullable=True)
    grader_response = Column(Text, nullable=True)

    # Timestamps.  ``utcnow()`` builds a SQL expression (see sqlalchemy_utc),
    # so the value is produced per statement, not once at import time.
    created_at = Column(UtcDateTime, default=utcnow(), nullable=False)
    completed_at = Column(UtcDateTime, nullable=True)

    # Error handling
    research_error = Column(Text, nullable=True)
    evaluation_error = Column(Text, nullable=True)

    # Additional metadata
    task_index = Column(Integer, nullable=True)  # Order in processing
    result_metadata = Column(JSON, nullable=True)  # Additional data

    # Relationships
    benchmark_run = relationship("BenchmarkRun", back_populates="results")

    # Indexes for performance
    __table_args__ = (
        Index(
            "idx_benchmark_results_run_dataset",
            "benchmark_run_id",
            "dataset_type",
        ),
        Index("idx_benchmark_results_query_hash", "query_hash"),
        Index("idx_benchmark_results_completed", "completed_at"),
        UniqueConstraint(
            "benchmark_run_id", "query_hash", name="uix_run_query"
        ),
        {"extend_existing": True},
    )

186 

187 

class BenchmarkConfig(Base):
    """Saved benchmark configuration that can be reused.

    Stores a named set of search, evaluation and dataset settings as JSON,
    together with default/public flags, usage counters and optional
    performance figures.
    """

    __tablename__ = "benchmark_configs"

    id = Column(Integer, primary_key=True, index=True)

    # Configuration details
    name = Column(String(255), nullable=False)
    description = Column(Text, nullable=True)
    # Indexed exactly once, by the named ``idx_benchmark_configs_hash`` entry
    # in ``__table_args__``; a column-level ``index=True`` would emit a
    # second, identical index.
    config_hash = Column(String(16), nullable=False)

    # Configuration data
    search_config = Column(JSON, nullable=False)
    evaluation_config = Column(JSON, nullable=False)
    datasets_config = Column(JSON, nullable=False)

    # Metadata.  ``utcnow()`` builds a SQL expression (see sqlalchemy_utc),
    # so the value is produced per statement, not once at import time.
    created_at = Column(UtcDateTime, default=utcnow(), nullable=False)
    updated_at = Column(
        UtcDateTime, default=utcnow(), onupdate=utcnow(), nullable=False
    )
    is_default = Column(Boolean, default=False, nullable=False)
    is_public = Column(Boolean, default=True, nullable=False)

    # Usage tracking
    usage_count = Column(Integer, default=0, nullable=False)
    last_used = Column(UtcDateTime, nullable=True)

    # Performance data (if available)
    best_accuracy = Column(Float, nullable=True)
    avg_processing_rate = Column(Float, nullable=True)

    # Indexes
    __table_args__ = (
        Index("idx_benchmark_configs_name", "name"),
        Index("idx_benchmark_configs_hash", "config_hash"),
        Index("idx_benchmark_configs_default", "is_default"),
        {"extend_existing": True},
    )

228 

229 

class BenchmarkProgress(Base):
    """Progress snapshot recorded for a benchmark run.

    Each row belongs to one ``BenchmarkRun`` (``benchmark_run_id``) and
    captures counters, accuracy figures and resource metrics at ``timestamp``.
    """

    __tablename__ = "benchmark_progress"

    id = Column(Integer, primary_key=True, index=True)

    # Parent run; the database removes snapshots when the run is deleted.
    benchmark_run_id = Column(
        Integer,
        ForeignKey("benchmark_runs.id", ondelete="CASCADE"),
        nullable=False,
        index=True,
    )

    # Counters at the moment of the snapshot.
    timestamp = Column(UtcDateTime, default=utcnow(), nullable=False)
    completed_examples = Column(Integer, nullable=False)
    total_examples = Column(Integer, nullable=False)

    # Accuracy so far, overall and broken down per dataset.
    overall_accuracy = Column(Float, nullable=True)
    dataset_accuracies = Column(JSON, nullable=True)

    # Throughput (examples per minute) and projected finish time.
    processing_rate = Column(Float, nullable=True)
    estimated_completion = Column(UtcDateTime, nullable=True)

    # What the run is working on right now.
    current_dataset = Column(Enum(DatasetType), nullable=True)
    current_example_id = Column(String(255), nullable=True)

    # Resource metrics: memory in MB, CPU as a percentage.
    memory_usage = Column(Float, nullable=True)
    cpu_usage = Column(Float, nullable=True)

    # Back-reference to the owning run.
    benchmark_run = relationship(
        "BenchmarkRun",
        back_populates="progress_updates",
    )

    # Composite index for real-time queries by run and time.
    __table_args__ = (
        Index(
            "idx_benchmark_progress_run_time",
            "benchmark_run_id",
            "timestamp",
        ),
        {"extend_existing": True},
    )