Coverage for src/local_deep_research/web/routes/context_overflow_api.py: 99%

112 statements  

« prev     ^ index     » next       coverage.py v7.14.1, created at 2026-06-03 23:15 +0000

1"""API endpoints for context overflow analytics.""" 

2 

3from flask import Blueprint, jsonify, request, session as flask_session 

4from datetime import datetime, timedelta, timezone 

5from sqlalchemy import func, desc 

6from loguru import logger 

7 

8from ...database.session_context import get_user_db_session 

9from ...database.models import TokenUsage 

10from ...metrics.query_utils import get_context_overflow_truncation_summary 

11from ...settings import SettingsManager 

12from ..auth.decorators import login_required 

13 

14context_overflow_bp = Blueprint("context_overflow_api", __name__) 

15 

16# NOTE: Routes use flask_session["username"] (not .get()) intentionally. 

17# @login_required guarantees the key exists; direct access fails fast 

18# if the decorator is ever removed. 

19 

20 

# Look-back window in days for every accepted ``period`` value.
# ``None`` means "no lower bound" (the "all" period).
_PERIOD_DAYS = {"7d": 7, "30d": 30, "3m": 90, "1y": 365, "all": None}
_DEFAULT_PERIOD = "30d"


def _period_start(period):
    """Return the timezone-aware UTC start of *period*, or None for "all"."""
    days = _PERIOD_DAYS[period]
    if days is None:
        return None
    return datetime.now(timezone.utc) - timedelta(days=days)


def _prompt_tokens_before_truncation(usage):
    """Return the prompt size that was requested before truncation.

    Prefers what Ollama reports it processed and falls back to the standard
    prompt count. Missing counts are treated as 0 so that a single row with
    null token columns cannot raise ``TypeError`` and fail the whole request.
    """
    processed = usage.ollama_prompt_eval_count or usage.prompt_tokens or 0
    return processed + (usage.tokens_truncated or 0)


def _chart_point(usage):
    """Serialize one TokenUsage row into a chart data point."""
    ollama_used = usage.ollama_prompt_eval_count  # What Ollama actually processed
    if usage.context_truncated:
        original_tokens = _prompt_tokens_before_truncation(usage)
    else:
        original_tokens = ollama_used or usage.prompt_tokens
    return {
        "timestamp": usage.timestamp.isoformat(),
        "research_id": usage.research_id,
        "prompt_tokens": usage.prompt_tokens,  # From our standard token counting
        "completion_tokens": usage.completion_tokens,
        "ollama_prompt_tokens": ollama_used,  # May be capped by the context limit
        "original_prompt_tokens": original_tokens,  # Requested, before truncation
        "context_limit": usage.context_limit,
        "truncated": bool(usage.context_truncated),
        "tokens_truncated": usage.tokens_truncated or 0,
        "model": usage.model_name,
        "provider": usage.model_provider,
        "research_phase": usage.research_phase,
        "response_time_ms": usage.response_time_ms,
    }


def _recent_truncated_entry(req):
    """Serialize one truncated TokenUsage row for the "recent_truncated" list."""
    return {
        "timestamp": req.timestamp.isoformat(),
        "research_id": req.research_id,
        "model": req.model_name,
        "prompt_tokens": req.prompt_tokens,  # Standard token count
        "ollama_tokens": req.ollama_prompt_eval_count,  # What Ollama actually used
        "original_tokens": _prompt_tokens_before_truncation(req),  # What was requested
        "context_limit": req.context_limit,
        "tokens_truncated": req.tokens_truncated,
        "truncation_ratio": req.truncation_ratio,
        "research_query": req.research_query,
    }


def _all_requests_entry(req):
    """Serialize one TokenUsage row for the paginated "all_requests" list."""
    return {
        "timestamp": req.timestamp.isoformat(),
        "research_id": req.research_id,
        "model": req.model_name,
        "provider": req.model_provider,
        "prompt_tokens": req.prompt_tokens,
        "completion_tokens": req.completion_tokens,
        "total_tokens": req.total_tokens,
        "context_limit": req.context_limit,
        "context_truncated": bool(req.context_truncated),
        "tokens_truncated": req.tokens_truncated or 0,
        # Stored ratio is scaled by 100 here; missing/zero ratios report 0.
        "truncation_ratio": round(req.truncation_ratio * 100, 2)
        if req.truncation_ratio
        else 0,
        "ollama_prompt_eval_count": req.ollama_prompt_eval_count,
        "research_query": req.research_query,
        "research_phase": req.research_phase,
    }


def _model_truncation_entry(stat):
    """Serialize one per-model aggregate row for the "model_stats" list."""
    truncated = stat.truncated_count or 0
    return {
        "model": stat.model_name,
        "provider": stat.model_provider,
        "total_requests": stat.total_requests,
        "truncated_count": int(truncated),
        "truncation_rate": round(truncated / stat.total_requests * 100, 2)
        if stat.total_requests > 0
        else 0,
        "avg_context_limit": round(stat.avg_context_limit, 0)
        if stat.avg_context_limit
        else None,
    }


@context_overflow_bp.route("/api/context-overflow", methods=["GET"])
@login_required
def get_context_overflow_metrics():
    """Get context overflow metrics for the current user.

    Query parameters:
        period: One of "7d", "30d", "3m", "1y" or "all"; any other value
            falls back to "30d".
        page: 1-based page of ``all_requests``; values below 1 become 1.
        per_page: Page size for ``all_requests`` (default 50), clamped to
            the range 1..500.

    Returns:
        A JSON response with ``status`` "success" and the sections
        ``overview``, ``token_summary``, ``model_token_stats``,
        ``phase_breakdown``, ``context_limits``, ``model_stats``,
        ``recent_truncated``, ``chart_data``, ``all_requests``,
        ``pagination`` and ``current_context_window``. If anything raises,
        the exception is logged and a generic error JSON is returned with
        HTTP status 500.
    """
    try:
        # Indexed directly (not .get()) on purpose: the request should fail
        # fast if the session carries no username.
        username = flask_session["username"]

        # Whitelist the period so only known windows reach the queries.
        period = request.args.get("period", _DEFAULT_PERIOD)
        if period not in _PERIOD_DAYS:
            period = _DEFAULT_PERIOD

        # Pagination params for all_requests
        page = max(1, request.args.get("page", 1, type=int))
        per_page = request.args.get("per_page", 50, type=int)
        per_page = max(1, min(per_page, 500))

        start_date = _period_start(period)

        with get_user_db_session(username) as session:
            # Truncation summary — shared with /metrics/api/metrics so the
            # main dashboard's at-a-glance numbers cannot disagree with this
            # endpoint's deep-dive.
            summary = get_context_overflow_truncation_summary(session, period)
            avg_tokens_truncated = summary["avg_tokens_truncated"]

            # Base query: every aggregation below derives from it so they
            # all share exactly the same time window.
            query = session.query(TokenUsage)
            if start_date:
                query = query.filter(TokenUsage.timestamp >= start_date)

            token_summary = {
                "total_requests": summary["total_requests"],
                "total_tokens": summary["total_tokens"],
                "total_prompt_tokens": summary["total_prompt_tokens"],
                "total_completion_tokens": summary["total_completion_tokens"],
                "avg_prompt_tokens": round(summary["avg_prompt_tokens"], 0),
                "avg_completion_tokens": round(
                    summary["avg_completion_tokens"], 0
                ),
                "max_prompt_tokens": summary["max_prompt_tokens"],
            }

            # --- Model token stats (always populated, no context_limit filter) ---
            model_token_rows = (
                query.with_entities(
                    TokenUsage.model_name,
                    TokenUsage.model_provider,
                    func.count(TokenUsage.id).label("total_requests"),
                    func.coalesce(func.sum(TokenUsage.total_tokens), 0).label(
                        "total_tokens"
                    ),
                    func.min(TokenUsage.prompt_tokens).label("min_prompt"),
                    func.avg(TokenUsage.prompt_tokens).label("avg_prompt"),
                    func.max(TokenUsage.prompt_tokens).label("max_prompt"),
                    func.avg(TokenUsage.response_time_ms).label(
                        "avg_response_time_ms"
                    ),
                )
                .group_by(TokenUsage.model_name, TokenUsage.model_provider)
                .all()
            )
            model_token_stats = [
                {
                    "model": row.model_name,
                    "provider": row.model_provider,
                    "total_requests": row.total_requests,
                    "total_tokens": int(row.total_tokens or 0),
                    "min_prompt": int(row.min_prompt or 0),
                    "avg_prompt": round(row.avg_prompt or 0, 0),
                    "max_prompt": int(row.max_prompt or 0),
                    "avg_response_time_ms": round(
                        row.avg_response_time_ms or 0, 0
                    ),
                }
                for row in model_token_rows
            ]

            # --- Phase breakdown (always populated, no context_limit filter) ---
            phase_rows = (
                query.with_entities(
                    TokenUsage.research_phase,
                    func.count(TokenUsage.id).label("count"),
                    func.coalesce(func.sum(TokenUsage.total_tokens), 0).label(
                        "total_tokens"
                    ),
                    func.avg(TokenUsage.total_tokens).label("avg_tokens"),
                )
                .group_by(TokenUsage.research_phase)
                .all()
            )
            phase_breakdown = [
                {
                    "phase": row.research_phase or "unknown",
                    "count": row.count,
                    "total_tokens": int(row.total_tokens or 0),
                    "avg_tokens": round(row.avg_tokens or 0, 0),
                }
                for row in phase_rows
            ]

            # Context limit distribution by model (rows with a known limit only).
            context_limit_rows = (
                query.with_entities(
                    TokenUsage.model_name,
                    TokenUsage.context_limit,
                    func.count(TokenUsage.id).label("count"),
                )
                .filter(TokenUsage.context_limit.isnot(None))
                .group_by(TokenUsage.model_name, TokenUsage.context_limit)
                .all()
            )

            # Most recent truncated requests.
            recent_truncated = (
                query.filter(TokenUsage.context_truncated.is_(True))
                .order_by(desc(TokenUsage.timestamp))
                .limit(20)
                .all()
            )

            # Time series for the chart - includes all records (even those
            # without context_limit, e.g. OpenRouter models). The row cap is
            # lower for the long windows. Rows are fetched newest-first so
            # the cap drops the OLDEST points rather than the latest ones,
            # then reversed so the chart still receives chronological data.
            chart_limit = 500 if period in ("3m", "1y") else 1000
            chart_rows = (
                query.order_by(desc(TokenUsage.timestamp), desc(TokenUsage.id))
                .limit(chart_limit)
                .all()
            )
            chart_data = [_chart_point(usage) for usage in reversed(chart_rows)]

            # Model-specific truncation stats (rows with a known limit only).
            model_stat_rows = (
                query.with_entities(
                    TokenUsage.model_name,
                    TokenUsage.model_provider,
                    func.count(TokenUsage.id).label("total_requests"),
                    func.sum(TokenUsage.context_truncated).label(
                        "truncated_count"
                    ),
                    func.avg(TokenUsage.context_limit).label(
                        "avg_context_limit"
                    ),
                )
                .filter(TokenUsage.context_limit.isnot(None))
                .group_by(TokenUsage.model_name, TokenUsage.model_provider)
                .all()
            )

            # --- Paginated all_requests ---
            # The id tie-breaker gives a total order, so rows sharing a
            # timestamp cannot be duplicated or skipped between pages.
            all_requests_query = query.order_by(
                desc(TokenUsage.timestamp), desc(TokenUsage.id)
            )
            all_requests_total = all_requests_query.count()
            # Ceiling division; an empty result still reports one page.
            all_requests_pages = max(
                1, (all_requests_total + per_page - 1) // per_page
            )
            all_requests_data = (
                all_requests_query.offset((page - 1) * per_page)
                .limit(per_page)
                .all()
            )

            response = {
                "status": "success",
                "overview": {
                    "total_requests": summary["total_requests"],
                    "requests_with_context_data": summary[
                        "requests_with_context"
                    ],
                    "truncated_requests": summary["truncated_requests"],
                    "truncation_rate": round(summary["truncation_rate"], 2),
                    "avg_tokens_truncated": round(avg_tokens_truncated, 0)
                    if avg_tokens_truncated
                    else 0,
                },
                "token_summary": token_summary,
                "model_token_stats": model_token_stats,
                "phase_breakdown": phase_breakdown,
                "context_limits": [
                    {"model": model, "limit": limit, "count": count}
                    for model, limit, count in context_limit_rows
                ],
                "model_stats": [
                    _model_truncation_entry(stat) for stat in model_stat_rows
                ],
                "recent_truncated": [
                    _recent_truncated_entry(req) for req in recent_truncated
                ],
                "chart_data": chart_data,
                "all_requests": [
                    _all_requests_entry(req) for req in all_requests_data
                ],
                "pagination": {
                    "page": page,
                    "per_page": per_page,
                    "total_count": all_requests_total,
                    "total_pages": all_requests_pages,
                },
                "current_context_window": SettingsManager(session).get_setting(
                    "llm.local_context_window_size"
                ),
            }

            return jsonify(response)

    except Exception:
        # Route boundary: log the traceback, return a generic message so no
        # internal details leak to the client.
        logger.exception("Error getting context overflow metrics")
        return jsonify(
            {
                "status": "error",
                "message": "Failed to load context overflow metrics",
            }
        ), 500

346 

347 

@context_overflow_bp.route(
    "/api/research/<string:research_id>/context-overflow", methods=["GET"]
)
@login_required
def get_research_context_overflow(research_id):
    """Get context overflow metrics for a specific research.

    Args:
        research_id: Identifier taken from the URL; only TokenUsage rows
            whose ``research_id`` matches are considered.

    Returns:
        A JSON response with ``status`` "success" and a ``data`` object
        holding ``overview``, ``phase_stats``, ``requests``, ``model`` and
        ``provider``. The payload has the same shape whether or not any
        rows exist: with no rows the counters are 0, ``context_limit``,
        ``model`` and ``provider`` are null, and the collections are empty.
        If anything raises, the exception is logged and a generic error
        JSON is returned with HTTP status 500.
    """
    try:
        # Indexed directly (not .get()) on purpose, matching the other route
        # in this module: fail fast if the session carries no username.
        username = flask_session["username"]

        with get_user_db_session(username) as session:
            # All token usage for this research, oldest first.
            token_usage = (
                session.query(TokenUsage)
                .filter(TokenUsage.research_id == research_id)
                .order_by(TokenUsage.timestamp)
                .all()
            )

            total_tokens = 0
            total_prompt = 0
            total_completion = 0
            max_tokens_used = 0
            truncated_count = 0
            tokens_lost = 0
            context_limit = None
            phase_stats = {}
            requests_data = []

            # Single pass: totals, per-phase buckets and the serialized rows.
            for req in token_usage:
                prompt = req.prompt_tokens or 0
                completion = req.completion_tokens or 0
                total = req.total_tokens or 0
                truncated = bool(req.context_truncated)

                total_prompt += prompt
                total_completion += completion
                total_tokens += total
                max_tokens_used = max(max_tokens_used, prompt)

                # First non-empty limit wins (should be the same for all
                # requests in a research).
                if context_limit is None and req.context_limit:
                    context_limit = req.context_limit

                bucket = phase_stats.setdefault(
                    req.research_phase or "unknown",
                    {
                        "count": 0,
                        "prompt_tokens": 0,
                        "completion_tokens": 0,
                        "total_tokens": 0,
                        "truncated_count": 0,
                    },
                )
                bucket["count"] += 1
                bucket["prompt_tokens"] += prompt
                bucket["completion_tokens"] += completion
                bucket["total_tokens"] += total

                if truncated:
                    truncated_count += 1
                    tokens_lost += req.tokens_truncated or 0
                    bucket["truncated_count"] += 1

                requests_data.append(
                    {
                        "timestamp": req.timestamp.isoformat(),
                        "phase": req.research_phase,
                        "prompt_tokens": req.prompt_tokens,
                        "completion_tokens": req.completion_tokens,
                        "total_tokens": req.total_tokens,
                        "context_limit": req.context_limit,
                        "context_truncated": truncated,
                        "tokens_truncated": req.tokens_truncated or 0,
                        "ollama_prompt_eval_count": req.ollama_prompt_eval_count,
                        "calling_function": req.calling_function,
                        "response_time_ms": req.response_time_ms,
                    }
                )

            # Model/provider are reported from the earliest request.
            first = token_usage[0] if token_usage else None

            response = {
                "status": "success",
                "data": {
                    "overview": {
                        "total_requests": len(token_usage),
                        "total_tokens": total_tokens,
                        "total_prompt_tokens": total_prompt,
                        "total_completion_tokens": total_completion,
                        "context_limit": context_limit,
                        "max_tokens_used": max_tokens_used,
                        "truncation_occurred": truncated_count > 0,
                        "truncated_count": truncated_count,
                        "tokens_lost": tokens_lost,
                    },
                    "phase_stats": phase_stats,
                    "requests": requests_data,
                    "model": first.model_name if first else None,
                    "provider": first.model_provider if first else None,
                },
            }

            return jsonify(response)

    except Exception:
        # Route boundary: log the traceback, return a generic message so no
        # internal details leak to the client.
        logger.exception("Error getting research context overflow")
        return jsonify(
            {
                "status": "error",
                "message": "Failed to load context overflow data",
            }
        ), 500

477 ), 500