Coverage for src/local_deep_research/database/models/benchmark.py: 100%

1"""Database models for benchmark system."""

3import enum

5from sqlalchemy import (

6 JSON,

7 Boolean,

8 Column,

9 Enum,

10 Float,

11 ForeignKey,

12 Index,

13 Integer,

14 String,

15 Text,

16 UniqueConstraint,

17)

18from sqlalchemy.orm import relationship

19from sqlalchemy_utc import UtcDateTime, utcnow

22# Use the same base as the main app

23from . import Base

class BenchmarkStatus(enum.Enum):
    """Lifecycle states a benchmark run can be in.

    Each member's value is the lowercase string form of its name.
    """

    # Created but not yet running.
    PENDING = "pending"
    # Currently running.
    IN_PROGRESS = "in_progress"

    # Terminal outcomes.
    COMPLETED = "completed"
    FAILED = "failed"
    CANCELLED = "cancelled"

    # Temporarily halted.
    PAUSED = "paused"

class DatasetType(enum.Enum):
    """Datasets a benchmark question can come from.

    Each member's value is the lowercase string form of its name.
    """

    # Named datasets.
    SIMPLEQA = "simpleqa"
    BROWSECOMP = "browsecomp"
    XBENCH_DEEPSEARCH = "xbench_deepsearch"

    # Anything that is not one of the named datasets above.
    CUSTOM = "custom"

class BenchmarkRun(Base):
    """A single benchmark run: its configuration, status and summary.

    Parent of the per-question ``BenchmarkResult`` rows and of the
    ``BenchmarkProgress`` snapshots; both relationships cascade deletes
    from the run to its children.
    """

    __tablename__ = "benchmark_runs"

    id = Column(Integer, primary_key=True, index=True)

    # --- Run identification -------------------------------------------
    # Optional user-friendly name.
    run_name = Column(String(255), nullable=True)
    # Hash of the configuration, used for compatibility matching.
    config_hash = Column(String(16), nullable=False, index=True)
    # List of query hashes, kept to avoid duplication.
    query_hash_list = Column(JSON, nullable=False)

    # --- Configuration (stored as JSON) -------------------------------
    # Complete search configuration.
    search_config = Column(JSON, nullable=False)
    # Evaluation settings.
    evaluation_config = Column(JSON, nullable=False)
    # Dataset selection and quantities.
    datasets_config = Column(JSON, nullable=False)

    # --- Status and timing --------------------------------------------
    status = Column(
        Enum(BenchmarkStatus),
        default=BenchmarkStatus.PENDING,
        nullable=False,
    )
    created_at = Column(UtcDateTime, default=utcnow(), nullable=False)
    updated_at = Column(
        UtcDateTime,
        default=utcnow(),
        onupdate=utcnow(),
        nullable=False,
    )
    start_time = Column(UtcDateTime, nullable=True)
    end_time = Column(UtcDateTime, nullable=True)

    # --- Progress counters (all start at zero) ------------------------
    total_examples = Column(Integer, default=0, nullable=False)
    completed_examples = Column(Integer, default=0, nullable=False)
    failed_examples = Column(Integer, default=0, nullable=False)

    # --- Results summary ----------------------------------------------
    overall_accuracy = Column(Float, nullable=True)
    # Examples per minute.
    processing_rate = Column(Float, nullable=True)

    # --- Error handling -----------------------------------------------
    error_message = Column(Text, nullable=True)

    # --- Relationships ------------------------------------------------
    # lazy="dynamic": attribute access yields a query object rather than
    # a fully loaded list.
    results = relationship(
        "BenchmarkResult",
        back_populates="benchmark_run",
        cascade="all, delete-orphan",
        lazy="dynamic",
    )
    progress_updates = relationship(
        "BenchmarkProgress",
        back_populates="benchmark_run",
        cascade="all, delete-orphan",
        lazy="dynamic",
    )

    # Indexes, plus extend_existing so the table definition may be
    # applied on top of an already-registered table of the same name.
    # NOTE(review): config_hash also has index=True on the column, so it
    # ends up with two single-column indexes; id's index=True is likewise
    # redundant with the primary key. Left unchanged here because
    # removing either alters the emitted schema.
    __table_args__ = (
        Index("idx_benchmark_runs_config_hash", "config_hash"),
        Index("idx_benchmark_runs_status_created", "status", "created_at"),
        {"extend_existing": True},
    )

114

115

class BenchmarkResult(Base):
    """Outcome of one benchmark question within a run.

    Stores the question and its reference answer, the research response,
    the grading outcome and any errors. A ``(benchmark_run_id,
    query_hash)`` pair is unique, so a run holds at most one row per
    query hash.
    """

    __tablename__ = "benchmark_results"

    id = Column(Integer, primary_key=True, index=True)

    # --- Owning run ---------------------------------------------------
    benchmark_run_id = Column(
        Integer,
        ForeignKey("benchmark_runs.id", ondelete="CASCADE"),
        nullable=False,
        index=True,
    )

    # --- Question identification --------------------------------------
    # Original dataset ID.
    example_id = Column(String(255), nullable=False)
    # Hash of the query, used for deduplication.
    query_hash = Column(String(32), nullable=False, index=True)
    dataset_type = Column(Enum(DatasetType), nullable=False)
    # UUID string or converted integer.
    research_id = Column(String(36), nullable=True, index=True)

    # --- Question and answer ------------------------------------------
    question = Column(Text, nullable=False)
    correct_answer = Column(Text, nullable=False)

    # --- Research results ---------------------------------------------
    response = Column(Text, nullable=True)
    extracted_answer = Column(Text, nullable=True)
    confidence = Column(String(10), nullable=True)
    processing_time = Column(Float, nullable=True)
    sources = Column(JSON, nullable=True)

    # --- Evaluation results -------------------------------------------
    is_correct = Column(Boolean, nullable=True)
    graded_confidence = Column(String(10), nullable=True)
    grader_response = Column(Text, nullable=True)

    # --- Timestamps ---------------------------------------------------
    created_at = Column(UtcDateTime, default=utcnow(), nullable=False)
    completed_at = Column(UtcDateTime, nullable=True)

    # --- Error handling -----------------------------------------------
    research_error = Column(Text, nullable=True)
    evaluation_error = Column(Text, nullable=True)

    # --- Additional metadata ------------------------------------------
    # Order in processing.
    task_index = Column(Integer, nullable=True)
    # Additional data.
    result_metadata = Column(JSON, nullable=True)

    # --- Relationships ------------------------------------------------
    benchmark_run = relationship("BenchmarkRun", back_populates="results")

    # Indexes and the per-run uniqueness rule.
    # NOTE(review): query_hash also has index=True on the column, giving
    # it two single-column indexes; the index=True on benchmark_run_id is
    # a prefix of the composite index below. Left unchanged here because
    # removing either alters the emitted schema.
    __table_args__ = (
        Index(
            "idx_benchmark_results_run_dataset",
            "benchmark_run_id",
            "dataset_type",
        ),
        Index("idx_benchmark_results_query_hash", "query_hash"),
        Index("idx_benchmark_results_completed", "completed_at"),
        UniqueConstraint(
            "benchmark_run_id",
            "query_hash",
            name="uix_run_query",
        ),
        {"extend_existing": True},
    )

186

187

class BenchmarkConfig(Base):
    """A named, stored benchmark configuration that can be reused.

    Holds the same three JSON configuration documents as a run (search,
    evaluation, datasets) together with usage counters and optional
    performance figures.
    """

    __tablename__ = "benchmark_configs"

    id = Column(Integer, primary_key=True, index=True)

    # --- Configuration details ----------------------------------------
    name = Column(String(255), nullable=False)
    description = Column(Text, nullable=True)
    config_hash = Column(String(16), nullable=False, index=True)

    # --- Configuration data (stored as JSON) --------------------------
    search_config = Column(JSON, nullable=False)
    evaluation_config = Column(JSON, nullable=False)
    datasets_config = Column(JSON, nullable=False)

    # --- Metadata -----------------------------------------------------
    created_at = Column(UtcDateTime, default=utcnow(), nullable=False)
    updated_at = Column(
        UtcDateTime,
        default=utcnow(),
        onupdate=utcnow(),
        nullable=False,
    )
    # New rows default to non-default and public.
    is_default = Column(Boolean, default=False, nullable=False)
    is_public = Column(Boolean, default=True, nullable=False)

    # --- Usage tracking -----------------------------------------------
    usage_count = Column(Integer, default=0, nullable=False)
    last_used = Column(UtcDateTime, nullable=True)

    # --- Performance data (if available) ------------------------------
    best_accuracy = Column(Float, nullable=True)
    avg_processing_rate = Column(Float, nullable=True)

    # Lookup indexes.
    # NOTE(review): config_hash also has index=True on the column, so it
    # ends up with two single-column indexes. Left unchanged here because
    # removing one alters the emitted schema.
    __table_args__ = (
        Index("idx_benchmark_configs_name", "name"),
        Index("idx_benchmark_configs_hash", "config_hash"),
        Index("idx_benchmark_configs_default", "is_default"),
        {"extend_existing": True},
    )

228

229

class BenchmarkProgress(Base):
    """Timestamped progress snapshot belonging to a benchmark run.

    Each row records how many examples were done out of the total at
    ``timestamp``, with optional accuracy, throughput and resource
    figures.
    """

    __tablename__ = "benchmark_progress"

    id = Column(Integer, primary_key=True, index=True)

    # --- Owning run ---------------------------------------------------
    benchmark_run_id = Column(
        Integer,
        ForeignKey("benchmark_runs.id", ondelete="CASCADE"),
        nullable=False,
        index=True,
    )

    # --- Progress data ------------------------------------------------
    timestamp = Column(UtcDateTime, default=utcnow(), nullable=False)
    completed_examples = Column(Integer, nullable=False)
    total_examples = Column(Integer, nullable=False)

    # --- Accuracy tracking --------------------------------------------
    overall_accuracy = Column(Float, nullable=True)
    # Per-dataset accuracy.
    dataset_accuracies = Column(JSON, nullable=True)

    # --- Performance metrics ------------------------------------------
    # Examples per minute.
    processing_rate = Column(Float, nullable=True)
    estimated_completion = Column(UtcDateTime, nullable=True)

    # --- Current position ---------------------------------------------
    current_dataset = Column(Enum(DatasetType), nullable=True)
    current_example_id = Column(String(255), nullable=True)

    # --- Additional metrics -------------------------------------------
    # Megabytes.
    memory_usage = Column(Float, nullable=True)
    # Percentage.
    cpu_usage = Column(Float, nullable=True)

    # --- Relationships ------------------------------------------------
    benchmark_run = relationship(
        "BenchmarkRun",
        back_populates="progress_updates",
    )

    # Composite index for real-time queries by run and time.
    # NOTE(review): the index=True on benchmark_run_id is a prefix of this
    # composite index. Left unchanged here because removing it alters the
    # emitted schema.
    __table_args__ = (
        Index(
            "idx_benchmark_progress_run_time",
            "benchmark_run_id",
            "timestamp",
        ),
        {"extend_existing": True},
    )

Coverage for src/local_deep_research/database/models/benchmark.py: 100%

101 statements