Coverage for src/local_deep_research/web/routes/metrics_routes.py: 84%

902 statements  

« prev     ^ index     » next       coverage.py v7.14.1, created at 2026-06-03 23:15 +0000

1"""Routes for metrics dashboard.""" 

2 

3from datetime import datetime, timedelta, UTC 

4from typing import Any 

5from urllib.parse import urlparse 

6 

7from flask import Blueprint, jsonify, request, session as flask_session 

8from loguru import logger 

9from sqlalchemy import case, func 

10 

11from ...database.models import ( 

12 Journal, 

13 Paper, 

14 PaperAppearance, 

15 RateLimitAttempt, 

16 RateLimitEstimate, 

17 Research, 

18 ResearchHistory, 

19 ResearchRating, 

20 ResearchResource, 

21 ResearchStrategy, 

22 TokenUsage, 

23) 

24from ...constants import get_available_strategies 

25from ...domain_classifier import DomainClassifier, DomainClassification 

26from ...database.session_context import get_user_db_session 

27from ...metrics import TokenCounter 

28from ...metrics.query_utils import ( 

29 get_context_overflow_truncation_summary, 

30 get_period_days, 

31 get_time_filter_condition, 

32) 

33from ...metrics.search_tracker import get_search_tracker 

34from ...web_search_engines.rate_limiting import get_tracker 

35from ...security.decorators import require_json_body 

36from ...security.rate_limiter import journal_data_limit, journals_read_limit 

37from ..auth.decorators import login_required 

38from ..utils.templates import render_template_with_defaults 

39 

# Create a Blueprint for metrics.
# Every route registered on it is served under the "/metrics" URL prefix.
metrics_bp = Blueprint("metrics", __name__, url_prefix="/metrics")

42 

43# NOTE: Routes use flask_session["username"] (not .get()) intentionally. 

44# @login_required guarantees the key exists; direct access fails fast 

45# if the decorator is ever removed. 

46 

47 

48def _extract_domain(url): 

49 """Extract normalized domain from URL, stripping www. prefix.""" 

50 try: 

51 parsed = urlparse(url) 

52 domain = parsed.netloc.lower() 

53 if domain.startswith("www."): 

54 domain = domain[4:] 

55 return domain if domain else None 

56 except (ValueError, AttributeError, TypeError): 

57 return None 

58 

59 

def get_rating_analytics(period="30d", research_mode="all", username=None):
    """Summarize the research ratings stored for a user.

    Args:
        period: Period key handed to ``get_period_days``; when that returns
            a falsy value no time filter is applied.
        research_mode: Accepted for signature compatibility; this function
            does not read it.
        username: Owner of the database to query. When falsy, the
            ``username`` entry of the Flask session is used instead.

    Returns:
        A dict with a single ``"rating_analytics"`` key holding the average
        rating (rounded to one decimal), the number of ratings, a 1-5
        distribution keyed by string, and satisfaction buckets. A zeroed
        payload is returned when there is no user, no rating, or any
        exception is raised (the exception is logged, not propagated).
    """

    def _empty_result(error=None):
        """Build the zeroed payload, optionally tagged with an error text."""
        analytics = {
            "avg_rating": None,
            "total_ratings": 0,
            "rating_distribution": {},
            "satisfaction_stats": {
                "very_satisfied": 0,
                "satisfied": 0,
                "neutral": 0,
                "dissatisfied": 0,
                "very_dissatisfied": 0,
            },
        }
        if error is not None:
            analytics["error"] = error
        return {"rating_analytics": analytics}

    # Satisfaction bucket name -> star value it counts.
    bucket_stars = (
        ("very_satisfied", 5),
        ("satisfied", 4),
        ("neutral", 3),
        ("dissatisfied", 2),
        ("very_dissatisfied", 1),
    )

    try:
        if not username:
            username = flask_session.get("username")
        if not username:
            return _empty_result("No user session")

        days = get_period_days(period)

        with get_user_db_session(username) as session:
            ratings_query = session.query(ResearchRating)
            if days:
                since = datetime.now(UTC) - timedelta(days=days)
                ratings_query = ratings_query.filter(
                    ResearchRating.created_at >= since
                )

            rows = ratings_query.all()
            if not rows:
                return _empty_result()

            scores = [row.rating for row in rows]
            mean_score = sum(scores) / len(scores)

            distribution = {
                str(star): scores.count(star) for star in range(1, 6)
            }
            satisfaction = {
                label: scores.count(star) for label, star in bucket_stars
            }

            return {
                "rating_analytics": {
                    "avg_rating": round(mean_score, 1),
                    "total_ratings": len(rows),
                    "rating_distribution": distribution,
                    "satisfaction_stats": satisfaction,
                }
            }

    except Exception:
        logger.exception("Error getting rating analytics")
        return _empty_result()

156 

157 

def get_link_analytics(period="30d", username=None):
    """Aggregate domain-level statistics over stored research resources.

    Args:
        period: Period key handed to ``get_period_days``; when that returns
            a falsy value no time filter is applied.
        username: Owner of the database to query. When falsy, the
            ``username`` entry of the Flask session is used instead.

    Returns:
        A dict with a single ``"link_analytics"`` key. On success it holds
        the ten most frequent domains (with share, research count, sample
        researches and classification), totals, per-category and
        per-source-type counts, a per-day trend and per-domain metrics.
        A zeroed payload is returned when there is no user, no resource,
        or any exception is raised (the exception is logged, not
        propagated).
    """

    def _empty_result(error=None):
        """Build the zeroed payload, optionally tagged with an error text."""
        analytics = {
            "top_domains": [],
            "total_unique_domains": 0,
            "avg_links_per_research": 0,
            "domain_distribution": {},
            "source_type_analysis": {},
            "academic_vs_general": {},
            "total_links": 0,
        }
        if error is not None:
            analytics["error"] = error
        return {"link_analytics": analytics}

    try:
        if not username:
            username = flask_session.get("username")
        if not username:
            return _empty_result("No user session")

        days = get_period_days(period)

        with get_user_db_session(username) as session:
            query = session.query(ResearchResource)

            if days:
                cutoff_date = datetime.now(UTC) - timedelta(days=days)
                # The cutoff is compared in ISO-8601 text form; created_at
                # is also sliced as text further down.
                query = query.filter(
                    ResearchResource.created_at >= cutoff_date.isoformat()
                )

            resources = query.all()
            if not resources:
                return _empty_result()

            total_links = len(resources)

            # Parse every URL exactly once and keep the (resource, domain)
            # pairs, so the tally loop below does not re-parse them.
            resource_domains = []
            for resource in resources:
                if not resource.url:
                    continue
                domain = _extract_domain(resource.url)
                if domain:
                    resource_domains.append((resource, domain))

            # Batch load all domain classifications in one query (no N+1).
            domain_classifications_map = {}
            all_domains = {domain for _, domain in resource_domains}
            if all_domains:
                all_classifications = (
                    session.query(DomainClassification)
                    .filter(DomainClassification.domain.in_(all_domains))
                    .all()
                )
                for classification in all_classifications:
                    domain_classifications_map[classification.domain] = (
                        classification
                    )

            domain_counts: dict[str, Any] = {}
            # Which researches used each domain.
            domain_researches: dict[str, Any] = {}
            source_types: dict[str, Any] = {}
            # Links per day, keyed by the first ten characters of created_at.
            temporal_data: dict[str, Any] = {}
            # Category counts taken from the stored classifications.
            category_counts: dict[str, Any] = {}

            for resource, domain in resource_domains:
                try:
                    domain_counts[domain] = domain_counts.get(domain, 0) + 1
                    domain_researches.setdefault(domain, set()).add(
                        resource.research_id
                    )

                    if resource.created_at:
                        date_str = resource.created_at[:10]  # YYYY-MM-DD
                        temporal_data[date_str] = (
                            temporal_data.get(date_str, 0) + 1
                        )

                    classification = domain_classifications_map.get(domain)
                    category = (
                        classification.category
                        if classification
                        else "Unclassified"
                    )
                    category_counts[category] = (
                        category_counts.get(category, 0) + 1
                    )

                    if resource.source_type:
                        source_types[resource.source_type] = (
                            source_types.get(resource.source_type, 0) + 1
                        )
                except Exception:
                    # Best effort: one malformed row must not abort the
                    # whole report.
                    logger.exception(f"Error parsing URL {resource.url}")

            # Most frequent domains first; keep the top ten.
            sorted_domains = sorted(
                domain_counts.items(), key=lambda item: item[1], reverse=True
            )
            top_10_domains = sorted_domains[:10]

            # "others" is measured against every resource, including those
            # for which no domain could be extracted.
            top_10_count = sum(count for _, count in top_10_domains)
            others_count = total_links - top_10_count

            unique_research_ids = {r.research_id for r in resources}
            avg_links = (
                total_links / len(unique_research_ids)
                if unique_research_ids
                else 0
            )

            temporal_trend = sorted(
                (
                    {"date": date, "count": count}
                    for date, count in temporal_data.items()
                ),
                key=lambda point: point["date"],
            )

            domain_classifications = {
                domain: {
                    "category": classification.category,
                    "subcategory": classification.subcategory,
                    "confidence": classification.confidence,
                }
                for domain, classification in domain_classifications_map.items()
            }

            # Pick up to three research ids per top domain. The ids come
            # from a set, so which three are picked is arbitrary; they are
            # not guaranteed to be the most recent ones.
            all_research_ids = []
            domain_research_id_lists = {}
            for domain, _ in top_10_domains:
                if domain in domain_researches:
                    ids = list(domain_researches[domain])[:3]
                    domain_research_id_lists[domain] = ids
                    all_research_ids.extend(ids)

            # Batch-load research details for those ids (no N+1).
            research_by_id = {}
            if all_research_ids:
                researches = (
                    session.query(ResearchHistory)
                    .filter(ResearchHistory.id.in_(all_research_ids))
                    .all()
                )
                research_by_id = {r.id: r for r in researches}

            domain_recent_research = {}
            for domain, ids in domain_research_id_lists.items():
                samples = []
                for r_id in ids:
                    if r_id not in research_by_id:
                        continue
                    research = research_by_id[r_id]
                    samples.append(
                        {
                            "id": r_id,
                            "query": research.query[:50]
                            if research and research.query
                            else "Research",
                        }
                    )
                domain_recent_research[domain] = samples

            return {
                "link_analytics": {
                    "top_domains": [
                        {
                            "domain": domain,
                            "count": count,
                            "percentage": round(
                                count / total_links * 100, 1
                            ),
                            "research_count": len(
                                domain_researches.get(domain, set())
                            ),
                            "recent_researches": domain_recent_research.get(
                                domain, []
                            ),
                            "classification": domain_classifications.get(
                                domain, None
                            ),
                        }
                        for domain, count in top_10_domains
                    ],
                    "total_unique_domains": len(domain_counts),
                    "avg_links_per_research": round(avg_links, 1),
                    "domain_distribution": {
                        "top_10": top_10_count,
                        "others": others_count,
                    },
                    "source_type_analysis": source_types,
                    "category_distribution": category_counts,
                    # Same data under a second key for the pie chart.
                    "domain_categories": category_counts,
                    "total_links": total_links,
                    "total_researches": len(unique_research_ids),
                    "temporal_trend": temporal_trend,
                    "domain_metrics": {
                        domain: {
                            "usage_count": count,
                            "usage_percentage": round(
                                count / total_links * 100, 1
                            ),
                            "research_diversity": len(
                                domain_researches.get(domain, set())
                            ),
                            "frequency_rank": rank + 1,
                        }
                        for rank, (domain, count) in enumerate(top_10_domains)
                    },
                }
            }

    except Exception:
        logger.exception("Error getting link analytics")
        return _empty_result("Failed to retrieve link analytics")

445 

446 

def get_strategy_analytics(period="30d", username=None):
    """Report how often each research strategy was used.

    Args:
        period: Period key handed to ``get_period_days``; when that returns
            a falsy value no time filter is applied.
        username: Owner of the database to query. When falsy, the
            ``username`` entry of the Flask session is used instead.

    Returns:
        A dict with a single ``"strategy_analytics"`` key holding usage
        counts and percentages per strategy, the most used strategy, and
        the list from ``get_available_strategies()``. A zeroed payload with
        an ``"error"`` or ``"message"`` entry is returned when there is no
        user, no strategy row at all, or any exception is raised (the
        exception is logged, not propagated).
    """

    def _empty_result(note_key, note_text):
        """Build the zeroed payload carrying one explanatory entry."""
        return {
            "strategy_analytics": {
                "total_research_with_strategy": 0,
                "total_research": 0,
                "most_popular_strategy": None,
                "strategy_usage": [],
                "strategy_distribution": {},
                "available_strategies": get_available_strategies(),
                note_key: note_text,
            }
        }

    try:
        if not username:
            username = flask_session.get("username")
        if not username:
            return _empty_result("error", "No user session")

        days = get_period_days(period)

        with get_user_db_session(username) as session:
            # Nothing has ever been recorded: say so instead of showing
            # an empty chart.
            if session.query(ResearchStrategy).count() == 0:
                logger.warning("No research strategies found in database")
                return _empty_result(
                    "message",
                    "Strategy tracking not yet available - run a research to start tracking",
                )

            # Per-strategy usage counts (no JOIN needed).
            usage_query = session.query(
                ResearchStrategy.strategy_name,
                func.count(ResearchStrategy.id).label("usage_count"),
            )
            if days:
                cutoff_date = datetime.now(UTC) - timedelta(days=days)
                usage_query = usage_query.filter(
                    ResearchStrategy.created_at >= cutoff_date
                )
            usage_rows = (
                usage_query.group_by(ResearchStrategy.strategy_name)
                .order_by(func.count(ResearchStrategy.id).desc())
                .all()
            )

            # Denominator for the percentages, under the same time filter.
            total_query = session.query(ResearchStrategy)
            if days:
                total_query = total_query.filter(
                    ResearchStrategy.created_at >= cutoff_date
                )
            total_research = total_query.count()

            strategy_usage = []
            strategy_distribution = {}
            for name, uses in usage_rows:
                if total_research > 0:
                    share = uses / total_research * 100
                else:
                    share = 0
                strategy_usage.append(
                    {
                        "strategy": name,
                        "count": uses,
                        "percentage": round(share, 1),
                    }
                )
                strategy_distribution[name] = uses

            # Rows arrive ordered by usage, so the first one is the winner.
            most_popular = None
            if strategy_usage:
                most_popular = strategy_usage[0]["strategy"]

            return {
                "strategy_analytics": {
                    "total_research_with_strategy": sum(
                        entry["count"] for entry in strategy_usage
                    ),
                    "total_research": total_research,
                    "most_popular_strategy": most_popular,
                    "strategy_usage": strategy_usage,
                    "strategy_distribution": strategy_distribution,
                    "available_strategies": get_available_strategies(),
                }
            }

    except Exception:
        logger.exception("Error getting strategy analytics")
        return _empty_result("error", "Failed to retrieve strategy data")

563 

564 

def get_rate_limiting_analytics(period="30d", username=None):
    """Summarize recorded rate-limit attempts, overall and per engine.

    Args:
        period: One of ``"7d"``, ``"30d"``, ``"3m"`` or ``"1y"``; any other
            value disables the time filter. NOTE(review): this mapping is
            hard-coded here rather than taken from ``get_period_days`` as
            in the sibling analytics functions - confirm whether the two
            are meant to agree.
        username: Owner of the database to query. When falsy, the
            ``username`` entry of the Flask session is used instead.

    Returns:
        A dict with a single ``"rate_limiting"`` key holding attempt
        totals, success rate, average wait times, one stats entry per
        engine type and counts of healthy/degraded/poor engines. A zeroed
        payload with an ``"error"`` entry is returned when there is no user
        or any exception is raised (the exception is logged, not
        propagated).
    """
    import time

    # Length, in days, of each supported reporting window.
    window_days = {"7d": 7, "30d": 30, "3m": 90, "1y": 365}

    def _empty_result(error):
        """Build the zeroed payload tagged with an error text."""
        return {
            "rate_limiting": {
                "total_attempts": 0,
                "successful_attempts": 0,
                "failed_attempts": 0,
                "success_rate": 0,
                "rate_limit_events": 0,
                "avg_wait_time": 0,
                "avg_successful_wait": 0,
                "tracked_engines": 0,
                "engine_stats": [],
                "total_engines_tracked": 0,
                "healthy_engines": 0,
                "degraded_engines": 0,
                "poor_engines": 0,
                "error": error,
            }
        }

    def _health(rate, healthy_above, degraded_above):
        """Map a success rate onto healthy/degraded/poor."""
        if rate > healthy_above:
            return "healthy"
        if rate > degraded_above:
            return "degraded"
        return "poor"

    try:
        if not username:
            username = flask_session.get("username")
        if not username:
            return _empty_result("No user session")

        # Attempt timestamps are compared against epoch seconds; a cutoff
        # of 0 means "no time filter".
        days = window_days.get(period)
        cutoff_time = time.time() - (days * 24 * 3600) if days else 0

        with get_user_db_session(username) as session:
            rate_limit_query = session.query(RateLimitAttempt)
            if cutoff_time > 0:
                rate_limit_query = rate_limit_query.filter(
                    RateLimitAttempt.timestamp >= cutoff_time
                )

            total_attempts = rate_limit_query.count()
            successful_attempts = rate_limit_query.filter(
                RateLimitAttempt.success
            ).count()
            failed_attempts = total_attempts - successful_attempts

            # Rate limiting events are failures typed as RateLimitError.
            rate_limit_events = rate_limit_query.filter(
                ~RateLimitAttempt.success,
                RateLimitAttempt.error_type == "RateLimitError",
            ).count()

            logger.info(
                f"Rate limit attempts in database: total={total_attempts}, successful={successful_attempts}"
            )

            # Full rows are needed for the wait-time averages and the
            # per-engine breakdown.
            attempts = rate_limit_query.all()

            if attempts:
                avg_wait_time = sum(a.wait_time for a in attempts) / len(
                    attempts
                )
                successful_wait_times = [
                    a.wait_time for a in attempts if a.success
                ]
                avg_successful_wait = (
                    sum(successful_wait_times) / len(successful_wait_times)
                    if successful_wait_times
                    else 0
                )
            else:
                avg_wait_time = 0
                avg_successful_wait = 0

            # Number of distinct engine types among the attempts.
            tracked_engines_query = session.query(
                func.count(func.distinct(RateLimitAttempt.engine_type))
            )
            if cutoff_time > 0:
                tracked_engines_query = tracked_engines_query.filter(
                    RateLimitAttempt.timestamp >= cutoff_time
                )
            tracked_engines = tracked_engines_query.scalar() or 0

            # The distinct engine types themselves.
            engine_types_query = session.query(
                RateLimitAttempt.engine_type
            ).distinct()
            if cutoff_time > 0:
                engine_types_query = engine_types_query.filter(
                    RateLimitAttempt.timestamp >= cutoff_time
                )
            engine_types = [row.engine_type for row in engine_types_query.all()]

            # Preload the estimates for those engines (no N+1).
            estimates_by_engine = {}
            if engine_types:
                all_estimates = (
                    session.query(RateLimitEstimate)
                    .filter(RateLimitEstimate.engine_type.in_(engine_types))
                    .all()
                )
                estimates_by_engine = {e.engine_type: e for e in all_estimates}

            # Group the attempts by engine in one pass instead of
            # rescanning the whole list for every engine type.
            attempts_by_engine = {}
            for attempt in attempts:
                attempts_by_engine.setdefault(attempt.engine_type, []).append(
                    attempt
                )

            engine_stats = []
            for engine_type in engine_types:
                engine_attempts_list = attempts_by_engine.get(engine_type, [])
                engine_attempts = len(engine_attempts_list)
                engine_success = sum(
                    1 for a in engine_attempts_list if a.success
                )
                recent_success_rate = (
                    (engine_success / engine_attempts * 100)
                    if engine_attempts > 0
                    else 0
                )

                estimate = estimates_by_engine.get(engine_type)
                if estimate:
                    # estimate.success_rate is a fraction (it is scaled by
                    # 100 below), hence the 0.8 / 0.5 thresholds.
                    status = _health(estimate.success_rate, 0.8, 0.5)
                    base_wait = estimate.base_wait_seconds
                    min_wait = estimate.min_wait_seconds
                    max_wait = estimate.max_wait_seconds
                    success_rate = round(estimate.success_rate * 100, 1)
                    lifetime_attempts = estimate.total_attempts
                else:
                    # Without an estimate, fall back to the percentage
                    # observed in the selected window.
                    status = _health(recent_success_rate, 80, 50)
                    base_wait = min_wait = max_wait = 0.0
                    success_rate = recent_success_rate
                    lifetime_attempts = engine_attempts

                engine_stat = {
                    "engine": engine_type,
                    "base_wait": base_wait,
                    "base_wait_seconds": round(base_wait, 2),
                    "min_wait_seconds": round(min_wait, 2),
                    "max_wait_seconds": round(max_wait, 2),
                    "success_rate": success_rate,
                    "total_attempts": lifetime_attempts,
                    "recent_attempts": engine_attempts,
                    "recent_success_rate": round(recent_success_rate, 1),
                    "attempts": engine_attempts,
                    "status": status,
                }

                if estimate:
                    # Uses the module-level datetime import; the ISO string
                    # carries the UTC offset.
                    engine_stat["last_updated"] = datetime.fromtimestamp(
                        estimate.last_updated, UTC
                    ).isoformat()
                else:
                    engine_stat["last_updated"] = "Never"

                engine_stats.append(engine_stat)

            logger.info(
                f"Tracked engines: {tracked_engines}, engine_stats: {engine_stats}"
            )

            result = {
                "rate_limiting": {
                    "total_attempts": total_attempts,
                    "successful_attempts": successful_attempts,
                    "failed_attempts": failed_attempts,
                    "success_rate": (successful_attempts / total_attempts * 100)
                    if total_attempts > 0
                    else 0,
                    "rate_limit_events": rate_limit_events,
                    "avg_wait_time": round(float(avg_wait_time), 2),
                    "avg_successful_wait": round(float(avg_successful_wait), 2),
                    "tracked_engines": tracked_engines,
                    "engine_stats": engine_stats,
                    "total_engines_tracked": tracked_engines,
                    "healthy_engines": sum(
                        1 for s in engine_stats if s["status"] == "healthy"
                    ),
                    "degraded_engines": sum(
                        1 for s in engine_stats if s["status"] == "degraded"
                    ),
                    "poor_engines": sum(
                        1 for s in engine_stats if s["status"] == "poor"
                    ),
                }
            }

            logger.info(
                f"DEBUG: Returning rate_limiting_analytics result: {result}"
            )
            return result

    except Exception:
        logger.exception("Error getting rate limiting analytics")
        return _empty_result(
            "An internal error occurred while processing the request."
        )

814 

815 

@metrics_bp.route("/")
@login_required
def metrics_dashboard():
    """Serve the metrics dashboard page (login required)."""
    template_name = "pages/metrics.html"
    return render_template_with_defaults(template_name)

821 

822 

@metrics_bp.route("/context-overflow")
@login_required
def context_overflow_page():
    """Serve the context overflow analytics page (login required)."""
    template_name = "pages/context_overflow.html"
    return render_template_with_defaults(template_name)

828 

829 

@metrics_bp.route("/api/metrics")
@login_required
def api_metrics():
    """Return the combined dashboard metrics as JSON.

    Query parameters:
        period: reporting window key (default ``"30d"``).
        mode: research mode filter (default ``"all"``).

    The response merges token, search, strategy and rate limiting metrics
    with a truncation summary and a ``user_satisfaction`` entry. Failures
    while computing user satisfaction or the truncation summary are logged
    and replaced by placeholder values; any other exception yields a
    generic JSON error with HTTP status 500.
    """
    logger.debug("api_metrics endpoint called")
    try:
        username = flask_session["username"]

        period = request.args.get("period", "30d")
        research_mode = request.args.get("mode", "all")

        token_counter = TokenCounter()
        search_tracker = get_search_tracker()

        token_metrics = token_counter.get_overall_metrics(
            period=period, research_mode=research_mode
        )
        search_metrics = search_tracker.get_search_metrics(
            period=period,
            research_mode=research_mode,
            username=username,
        )

        # User satisfaction: average rating and rating count in the window.
        try:
            with get_user_db_session(username) as session:
                ratings_query = session.query(ResearchRating)
                period_filter = get_time_filter_condition(
                    period, ResearchRating.created_at
                )
                if period_filter is not None:
                    ratings_query = ratings_query.filter(period_filter)

                mean_rating = ratings_query.with_entities(
                    func.avg(ResearchRating.rating).label("avg_rating")
                ).scalar()
                rating_count = ratings_query.count()

                if mean_rating:
                    shown_rating = round(mean_rating, 1)
                else:
                    shown_rating = None
                user_satisfaction = {
                    "avg_rating": shown_rating,
                    "total_ratings": rating_count,
                }
        except Exception:
            logger.exception("Error getting user satisfaction data")
            user_satisfaction = {"avg_rating": None, "total_ratings": 0}

        strategy_data = get_strategy_analytics(period, username)
        logger.debug(f"strategy_data keys: {list(strategy_data.keys())}")

        rate_limiting_data = get_rate_limiting_analytics(period, username)
        logger.debug(f"rate_limiting_data: {rate_limiting_data}")
        logger.debug(
            f"rate_limiting_data keys: {list(rate_limiting_data.keys())}"
        )

        # Truncation summary shown on the main dashboard. The failure
        # sentinel is None rather than 0: a real zero means "no
        # truncation", so defaulting to 0 on error would turn a red signal
        # green without anyone noticing.
        context_overflow_data = {
            "truncation_rate": None,
            "avg_tokens_truncated": None,
        }
        try:
            with get_user_db_session(username) as session:
                # Apply the same research_mode filter as the other metrics
                # above, so this panel follows the mode toggle too.
                summary = get_context_overflow_truncation_summary(
                    session, period, research_mode=research_mode
                )
            truncation_rate = round(summary["truncation_rate"], 1)
            avg_tokens_truncated = int(summary["avg_tokens_truncated"])
            context_overflow_data = {
                "truncation_rate": truncation_rate,
                "avg_tokens_truncated": avg_tokens_truncated,
            }
        except Exception:
            logger.exception(
                "Error getting context overflow summary for /api/metrics"
            )

        # Later sources win on key collisions, in this order.
        combined_metrics = {}
        for part in (
            token_metrics,
            search_metrics,
            strategy_data,
            rate_limiting_data,
            context_overflow_data,
        ):
            combined_metrics = {**combined_metrics, **part}
        combined_metrics["user_satisfaction"] = user_satisfaction

        logger.debug(f"combined_metrics keys: {list(combined_metrics.keys())}")
        logger.debug(
            f"combined_metrics['rate_limiting']: {combined_metrics.get('rate_limiting', 'NOT FOUND')}"
        )

        payload = {
            "status": "success",
            "metrics": combined_metrics,
            "period": period,
            "research_mode": research_mode,
        }
        return jsonify(payload)
    except Exception:
        logger.exception("Error getting metrics")
        failure = {
            "status": "error",
            "message": "An internal error occurred. Please try again later.",
        }
        return jsonify(failure), 500

952 

953 

@metrics_bp.route("/api/rate-limiting")
@login_required
def api_rate_limiting_metrics():
    """Return detailed rate limiting metrics as JSON.

    Reads the optional ``period`` query parameter (default ``"30d"``) and
    delegates to ``get_rate_limiting_analytics``. Any exception is logged
    and answered with a generic JSON error and HTTP status 500.
    """
    # KNOWN-DEFERRED: development-time debug log. It carries no PII (it
    # only marks endpoint entry) but is noisy; cleanup is planned post-merge.
    logger.info("DEBUG: api_rate_limiting_metrics endpoint called")
    try:
        username = flask_session["username"]
        period = request.args.get("period", "30d")
        payload = {
            "status": "success",
            "data": get_rate_limiting_analytics(period, username),
            "period": period,
        }
        return jsonify(payload)
    except Exception:
        logger.exception("Error getting rate limiting metrics")
        failure = {
            "status": "error",
            "message": "Failed to retrieve rate limiting metrics",
        }
        return jsonify(failure), 500

977 

978 

@metrics_bp.route("/api/rate-limiting/current")
@login_required
def api_current_rate_limits():
    """Return the tracker's current rate limit estimates as JSON.

    Each entry of ``get_tracker().get_stats()`` is unpacked as a 7-item
    sequence and reported with rounded wait times, a percentage success
    rate, an ISO-8601 UTC ``last_updated`` and a health status. Any
    exception is logged and answered with a generic JSON error and HTTP
    status 500.
    """

    def _describe(stat):
        """Convert one tracker stats entry into its JSON representation."""
        (
            engine_type,
            base_wait,
            min_wait,
            max_wait,
            last_updated,
            total_attempts,
            success_rate,
        ) = stat

        # success_rate is a fraction here; it is scaled by 100 for output.
        if success_rate > 0.8:
            health = "healthy"
        elif success_rate > 0.5:
            health = "degraded"
        else:
            health = "poor"

        # The ISO string carries the UTC offset.
        updated_at = datetime.fromtimestamp(last_updated, UTC).isoformat()

        return {
            "engine_type": engine_type,
            "base_wait_seconds": round(base_wait, 2),
            "min_wait_seconds": round(min_wait, 2),
            "max_wait_seconds": round(max_wait, 2),
            "success_rate": round(success_rate * 100, 1),
            "total_attempts": total_attempts,
            "last_updated": updated_at,
            "status": health,
        }

    try:
        tracker = get_tracker()
        current_limits = [_describe(stat) for stat in tracker.get_stats()]
        payload = {
            "status": "success",
            "current_limits": current_limits,
            "timestamp": datetime.now(UTC).isoformat(),
        }
        return jsonify(payload)
    except Exception:
        logger.exception("Error getting current rate limits")
        failure = {
            "status": "error",
            "message": "Failed to retrieve current rate limits",
        }
        return jsonify(failure), 500

1032 

1033 

@metrics_bp.route("/api/metrics/research/<string:research_id>/links")
@login_required
def api_research_link_metrics(research_id):
    """Get link analytics for a specific research.

    Loads every ``ResearchResource`` of the research, counts links per
    domain and per domain category, and returns the top 20 domains plus the
    first 10 resources.

    Args:
        research_id: Identifier of the research, taken from the URL.

    Returns:
        A JSON response with ``status`` and ``data``; an all-empty ``data``
        when the research has no resources; an error payload with HTTP 500
        if anything raises.
    """
    try:
        username = flask_session["username"]

        with get_user_db_session(username) as session:
            # Get all resources for this specific research
            resources = (
                session.query(ResearchResource)
                .filter(ResearchResource.research_id == research_id)
                .all()
            )

            if not resources:
                return jsonify(
                    {
                        "status": "success",
                        "data": {
                            "total_links": 0,
                            "unique_domains": 0,
                            "domains": [],
                            "category_distribution": {},
                            "domain_categories": {},
                            "resources": [],
                        },
                    }
                )

            # Resolve each resource's domain exactly once; resources without
            # a URL or without an extractable domain are left out of the
            # domain/category counts (but still count toward total_links).
            resource_domains = []
            for resource in resources:
                if not resource.url:
                    continue
                domain = _extract_domain(resource.url)
                if domain:
                    resource_domains.append(domain)

            # Batch load all domain classifications in one query (avoids N+1)
            domain_classifications_map = {}
            if resource_domains:
                all_classifications = (
                    session.query(DomainClassification)
                    .filter(
                        DomainClassification.domain.in_(set(resource_domains))
                    )
                    .all()
                )
                domain_classifications_map = {
                    classification.domain: classification
                    for classification in all_classifications
                }

            domain_counts: dict[str, int] = {}
            # Generic category counting from LLM classifications
            category_counts: dict[str, int] = {}

            for domain in resource_domains:
                domain_counts[domain] = domain_counts.get(domain, 0) + 1

                try:
                    classification = domain_classifications_map.get(domain)
                    category = (
                        classification.category
                        if classification
                        else "Unclassified"
                    )
                    category_counts[category] = (
                        category_counts.get(category, 0) + 1
                    )
                except (AttributeError, KeyError) as e:
                    # Best effort: a malformed classification must not fail
                    # the whole request
                    logger.debug(f"Error classifying domain {domain}: {e}")

            # Sort domains by count (stable, so ties keep first-seen order)
            sorted_domains = sorted(
                domain_counts.items(), key=lambda x: x[1], reverse=True
            )
            total_links = len(resources)

            return jsonify(
                {
                    "status": "success",
                    "data": {
                        "total_links": total_links,
                        "unique_domains": len(domain_counts),
                        "domains": [
                            {
                                "domain": domain,
                                "count": count,
                                "percentage": round(
                                    count / total_links * 100, 1
                                ),
                            }
                            for domain, count in sorted_domains[
                                :20
                            ]  # Top 20 domains
                        ],
                        "category_distribution": category_counts,
                        # Same counts under a second key
                        "domain_categories": category_counts,
                        "resources": [
                            {
                                "title": r.title or "Untitled",
                                "url": r.url,
                                "preview": r.content_preview[:200]
                                if r.content_preview
                                else None,
                            }
                            for r in resources[:10]  # First 10 resources
                        ],
                    },
                }
            )

    except Exception:
        logger.exception("Error getting research link metrics")
        return jsonify(
            {"status": "error", "message": "Failed to retrieve link metrics"}
        ), 500

1159 

1160 

@metrics_bp.route("/api/metrics/research/<string:research_id>")
@login_required
def api_research_metrics(research_id):
    """Return the ``TokenCounter`` metrics recorded for one research.

    Args:
        research_id: Identifier of the research, taken from the URL.

    Returns:
        A JSON response with ``status`` and ``metrics``, or a generic error
        payload with HTTP 500 if the lookup raises.
    """
    try:
        metrics = TokenCounter().get_research_metrics(research_id)
        return jsonify({"status": "success", "metrics": metrics})
    except Exception:
        logger.exception("Error getting research metrics")
        failure = {
            "status": "error",
            "message": "An internal error occurred. Please try again later.",
        }
        return jsonify(failure), 500

1180 

1181 

@metrics_bp.route("/api/metrics/research/<string:research_id>/timeline")
@login_required
def api_research_timeline_metrics(research_id):
    """Return the ``TokenCounter`` timeline metrics for one research.

    Args:
        research_id: Identifier of the research, taken from the URL.

    Returns:
        A JSON response with ``status`` and ``metrics``, or a generic error
        payload with HTTP 500 if the lookup raises.
    """
    try:
        counter = TokenCounter()
        timeline = counter.get_research_timeline_metrics(research_id)
        return jsonify({"status": "success", "metrics": timeline})
    except Exception:
        logger.exception("Error getting research timeline metrics")
        failure = {
            "status": "error",
            "message": "An internal error occurred. Please try again later.",
        }
        return jsonify(failure), 500

1203 

1204 

@metrics_bp.route("/api/metrics/research/<string:research_id>/search")
@login_required
def api_research_search_metrics(research_id):
    """Return the search tracker's metrics for one research.

    The lookup is made on behalf of the logged-in user, whose name is read
    from the Flask session.

    Args:
        research_id: Identifier of the research, taken from the URL.

    Returns:
        A JSON response with ``status`` and ``metrics``, or a generic error
        payload with HTTP 500 if anything raises.
    """
    try:
        current_user = flask_session["username"]
        tracker = get_search_tracker()
        metrics = tracker.get_research_search_metrics(
            research_id, username=current_user
        )
        return jsonify({"status": "success", "metrics": metrics})
    except Exception:
        logger.exception("Error getting research search metrics")
        failure = {
            "status": "error",
            "message": "An internal error occurred. Please try again later.",
        }
        return jsonify(failure), 500

1227 

1228 

@metrics_bp.route("/api/metrics/enhanced")
@login_required
def api_enhanced_metrics():
    """Return the enhanced tracking metrics for a period and research mode.

    Query parameters:
        period: Time window, default ``"30d"``.
        mode: Research mode filter, default ``"all"``.

    The token counter's enhanced metrics are extended with the search time
    series (used by the chart) and with the rating analytics.

    Returns:
        A JSON response with ``metrics``, ``period`` and ``research_mode``,
        or a generic error payload with HTTP 500 if anything raises.
    """
    try:
        args = request.args
        period = args.get("period", "30d")
        research_mode = args.get("mode", "all")
        username = flask_session["username"]

        token_counter = TokenCounter()
        search_tracker = get_search_tracker()

        metrics = token_counter.get_enhanced_metrics(
            period=period, research_mode=research_mode
        )

        # Search time series data for the chart
        metrics["search_time_series"] = search_tracker.get_search_time_series(
            period=period,
            research_mode=research_mode,
            username=username,
        )

        # Rating analytics are merged into the top level of the metrics
        metrics.update(get_rating_analytics(period, research_mode, username))

        payload = {
            "status": "success",
            "metrics": metrics,
            "period": period,
            "research_mode": research_mode,
        }
        return jsonify(payload)
    except Exception:
        logger.exception("Error getting enhanced metrics")
        failure = {
            "status": "error",
            "message": "An internal error occurred. Please try again later.",
        }
        return jsonify(failure), 500

1277 

1278 

@metrics_bp.route("/api/ratings/<string:research_id>", methods=["GET"])
@login_required
def api_get_research_rating(research_id):
    """Get rating for a specific research session.

    Args:
        research_id: Identifier of the research, taken from the URL.

    Returns:
        A JSON response with the rating and its ISO-formatted timestamps,
        ``{"status": "success", "rating": None}`` when no rating exists, or
        a generic error payload with HTTP 500 if anything raises.
    """

    def _iso_or_none(timestamp):
        """Format *timestamp* as ISO 8601, passing ``None`` through."""
        return timestamp.isoformat() if timestamp is not None else None

    try:
        username = flask_session["username"]

        with get_user_db_session(username) as session:
            rating = (
                session.query(ResearchRating)
                .filter_by(research_id=research_id)
                .first()
            )

            if rating is None:
                return jsonify({"status": "success", "rating": None})

            # NOTE(review): whether these columns can be NULL depends on the
            # model's column defaults, which are not visible here. Formatting
            # defensively keeps a missing timestamp from failing the lookup.
            return jsonify(
                {
                    "status": "success",
                    "rating": rating.rating,
                    "created_at": _iso_or_none(rating.created_at),
                    "updated_at": _iso_or_none(rating.updated_at),
                }
            )

    except Exception:
        logger.exception("Error getting research rating")
        return (
            jsonify(
                {
                    "status": "error",
                    "message": "An internal error occurred. Please try again later.",
                }
            ),
            500,
        )

1315 

1316 

@metrics_bp.route("/api/ratings/<string:research_id>", methods=["POST"])
@login_required
@require_json_body(error_format="status")
def api_save_research_rating(research_id):
    """Save or update rating for a specific research session.

    The JSON body must carry ``rating`` as an integer from 1 to 5. An
    existing rating for the research is updated in place; otherwise a new
    one is created.

    Args:
        research_id: Identifier of the research, taken from the URL.

    Returns:
        A JSON success payload echoing the rating, an error payload with
        HTTP 400 when the rating is invalid, or a generic error payload
        with HTTP 500 if anything raises.
    """
    try:
        username = flask_session["username"]

        data = request.get_json()
        # A JSON body that is not an object has no "rating" field
        rating_value = data.get("rating") if isinstance(data, dict) else None

        # bool is a subclass of int, so JSON `true` would otherwise pass the
        # integer check and be stored as a rating.
        is_valid_rating = (
            isinstance(rating_value, int)
            and not isinstance(rating_value, bool)
            and 1 <= rating_value <= 5
        )
        if not is_valid_rating:
            return (
                jsonify(
                    {
                        "status": "error",
                        "message": "Rating must be an integer between 1 and 5",
                    }
                ),
                400,
            )

        with get_user_db_session(username) as session:
            # Check if rating already exists
            existing_rating = (
                session.query(ResearchRating)
                .filter_by(research_id=research_id)
                .first()
            )

            if existing_rating:
                # Update existing rating; the timestamp is set by the database
                existing_rating.rating = rating_value
                existing_rating.updated_at = func.now()
            else:
                # Create new rating
                session.add(
                    ResearchRating(
                        research_id=research_id, rating=rating_value
                    )
                )

            session.commit()

            return jsonify(
                {
                    "status": "success",
                    "message": "Rating saved successfully",
                    "rating": rating_value,
                }
            )

    except Exception:
        logger.exception("Error saving research rating")
        return (
            jsonify(
                {
                    "status": "error",
                    "message": "An internal error occurred. Please try again later.",
                }
            ),
            500,
        )

1384 

1385 

@metrics_bp.route("/star-reviews")
@login_required
def star_reviews():
    """Render the star reviews metrics page."""
    template_name = "pages/star_reviews.html"
    return render_template_with_defaults(template_name)

1391 

1392 

@metrics_bp.route("/costs")
@login_required
def cost_analytics():
    """Render the cost analytics page."""
    template_name = "pages/cost_analytics.html"
    return render_template_with_defaults(template_name)

1398 

1399 

def _rating_group_entry(row, key_name, key_value):
    """Build the JSON entry for one grouped rating row.

    Args:
        row: Result row exposing ``avg_rating``, ``rating_count`` and
            ``positive_ratings``; NULL aggregates are reported as 0.
        key_name: Name of the grouping field in the output
            (``"model"`` or ``"search_engine"``).
        key_value: Value of the grouping field for this row.

    Returns:
        A dict with the grouping field, average, counts and the share of
        ratings that are 4 stars or better, as a percentage.
    """
    rating_count = row.rating_count or 0
    positive_ratings = row.positive_ratings or 0
    return {
        key_name: key_value,
        "avg_rating": round(row.avg_rating or 0, 2),
        "rating_count": rating_count,
        "positive_ratings": positive_ratings,
        # max(..., 1) guards the division for a group with no counted ratings
        "satisfaction_rate": round(
            positive_ratings / max(rating_count, 1) * 100, 1
        ),
    }


@metrics_bp.route("/api/star-reviews")
@login_required
def api_star_reviews():
    """Get star reviews analytics data.

    Query parameters:
        period: Time window applied to ``ResearchRating.created_at``,
            default ``"30d"``.

    Returns:
        A JSON response with ``overall_stats``, ``llm_ratings``,
        ``search_engine_ratings``, ``rating_trends`` and the 20 most recent
        ratings, or ``{"error": ...}`` with HTTP 500 if anything raises.
    """
    try:
        username = flask_session["username"]

        period = request.args.get("period", "30d")

        with get_user_db_session(username) as session:
            time_condition = get_time_filter_condition(
                period, ResearchRating.created_at
            )

            def in_period(query):
                """Apply the period filter when the period defines one."""
                if time_condition is None:
                    return query
                return query.filter(time_condition)

            def star_count(stars, label):
                """Aggregate counting the ratings equal to *stars*."""
                return func.sum(
                    case((ResearchRating.rating == stars, 1), else_=0)
                ).label(label)

            def positive_count():
                """Aggregate counting the ratings of 4 stars or better."""
                return func.sum(
                    case((ResearchRating.rating >= 4, 1), else_=0)
                ).label("positive_ratings")

            # Overall rating statistics
            overall_stats = in_period(
                session.query(
                    func.avg(ResearchRating.rating).label("avg_rating"),
                    func.count(ResearchRating.rating).label("total_ratings"),
                    star_count(5, "five_star"),
                    star_count(4, "four_star"),
                    star_count(3, "three_star"),
                    star_count(2, "two_star"),
                    star_count(1, "one_star"),
                )
            ).first()

            # NOTE(review): api_research_costs loads a list of TokenUsage
            # rows per research. If a research has several, the outer joins
            # below repeat each rating once per usage row, which would
            # inflate rating_count and duplicate recent_ratings entries.
            # Confirm against the data model before relying on these counts.

            # Ratings by LLM model (taken from token_usage since Research
            # doesn't have a model field)
            llm_ratings = (
                in_period(
                    session.query(
                        func.coalesce(TokenUsage.model_name, "Unknown").label(
                            "model"
                        ),
                        func.avg(ResearchRating.rating).label("avg_rating"),
                        func.count(ResearchRating.rating).label(
                            "rating_count"
                        ),
                        positive_count(),
                    ).outerjoin(
                        TokenUsage,
                        ResearchRating.research_id == TokenUsage.research_id,
                    )
                )
                .group_by(TokenUsage.model_name)
                .order_by(func.avg(ResearchRating.rating).desc())
                .all()
            )

            # Ratings by search engine (also taken from token_usage)
            search_engine_ratings = (
                in_period(
                    session.query(
                        func.coalesce(
                            TokenUsage.search_engine_selected, "Unknown"
                        ).label("search_engine"),
                        func.avg(ResearchRating.rating).label("avg_rating"),
                        func.count(ResearchRating.rating).label(
                            "rating_count"
                        ),
                        positive_count(),
                    ).outerjoin(
                        TokenUsage,
                        ResearchRating.research_id == TokenUsage.research_id,
                    )
                )
                .group_by(TokenUsage.search_engine_selected)
                .having(func.count(ResearchRating.rating) > 0)
                .order_by(func.avg(ResearchRating.rating).desc())
                .all()
            )

            # Rating trends over time, one row per calendar day
            rating_trends = (
                in_period(
                    session.query(
                        func.date(ResearchRating.created_at).label("date"),
                        func.avg(ResearchRating.rating).label("avg_rating"),
                        func.count(ResearchRating.rating).label("daily_count"),
                    )
                )
                .group_by(func.date(ResearchRating.created_at))
                .order_by("date")
                .all()
            )

            # Recent ratings with research details.
            # NOTE(review): two selected columns are named created_at
            # (ResearchRating and Research); confirm that row.created_at
            # below resolves to the rating's timestamp.
            recent_ratings = (
                in_period(
                    session.query(
                        ResearchRating.rating,
                        ResearchRating.created_at,
                        ResearchRating.research_id,
                        Research.query,
                        Research.mode,
                        TokenUsage.model_name,
                        Research.created_at,
                    )
                    .outerjoin(
                        Research, ResearchRating.research_id == Research.id
                    )
                    .outerjoin(
                        TokenUsage,
                        ResearchRating.research_id == TokenUsage.research_id,
                    )
                )
                .order_by(ResearchRating.created_at.desc())
                .limit(20)
                .all()
            )

            return jsonify(
                {
                    "overall_stats": {
                        "avg_rating": round(overall_stats.avg_rating or 0, 2),
                        "total_ratings": overall_stats.total_ratings or 0,
                        "rating_distribution": {
                            "5": overall_stats.five_star or 0,
                            "4": overall_stats.four_star or 0,
                            "3": overall_stats.three_star or 0,
                            "2": overall_stats.two_star or 0,
                            "1": overall_stats.one_star or 0,
                        },
                    },
                    "llm_ratings": [
                        _rating_group_entry(row, "model", row.model)
                        for row in llm_ratings
                    ],
                    "search_engine_ratings": [
                        _rating_group_entry(
                            row, "search_engine", row.search_engine
                        )
                        for row in search_engine_ratings
                    ],
                    "rating_trends": [
                        {
                            "date": str(trend.date),
                            "avg_rating": round(trend.avg_rating or 0, 2),
                            "count": trend.daily_count or 0,
                        }
                        for trend in rating_trends
                    ],
                    "recent_ratings": [
                        {
                            "rating": rating.rating,
                            "created_at": str(rating.created_at),
                            "research_id": rating.research_id,
                            # Placeholders for ratings whose research or
                            # token usage row is missing
                            "query": (
                                rating.query
                                if rating.query
                                else f"Research Session #{rating.research_id}"
                            ),
                            "mode": rating.mode
                            if rating.mode
                            else "Standard Research",
                            "llm_model": (
                                rating.model_name
                                if rating.model_name
                                else "LLM Model"
                            ),
                        }
                        for rating in recent_ratings
                    ],
                }
            )

    except Exception:
        logger.exception("Error getting star reviews data")
        return (
            jsonify(
                {"error": "An internal error occurred. Please try again later."}
            ),
            500,
        )

1623 

1624 

@metrics_bp.route("/api/pricing")
@login_required
def api_pricing():
    """Return the statically configured LLM pricing table.

    Returns:
        A JSON response with ``pricing``, the current UTC time as
        ``last_updated`` and an explanatory ``note``, or ``{"error": ...}``
        with HTTP 500 if anything raises.
    """
    try:
        from ...metrics.pricing.pricing_fetcher import PricingFetcher

        # Static pricing data is used instead of the async fetch path
        static_pricing = PricingFetcher().static_pricing

        payload = {
            "status": "success",
            "pricing": static_pricing,
            "last_updated": datetime.now(UTC).isoformat(),
            "note": "Pricing data is from static configuration. Real-time APIs not available for most providers.",
        }
        return jsonify(payload)

    except Exception:
        logger.exception("Error fetching pricing data")
        return jsonify({"error": "Internal Server Error"}), 500

1648 

1649 

@metrics_bp.route("/api/pricing/<model_name>")
@login_required
def api_model_pricing(model_name):
    """Return the pricing entry for a single model.

    The calculator's cache is consulted first. When it yields nothing
    truthy, the ``pricing_used`` field of a 1000/1000-token cost calculation
    is used instead (an empty dict if that field is absent).

    Args:
        model_name: Model name, taken from the URL.

    Query parameters:
        provider: Optional; echoed back in the response.

    Returns:
        A JSON response with ``model``, ``provider``, ``pricing`` and
        ``last_updated``, or ``{"error": ...}`` with HTTP 500 if anything
        raises.
    """
    try:
        # Optional provider parameter
        provider = request.args.get("provider")

        from ...metrics.pricing.cost_calculator import CostCalculator

        # Synchronous approach with cached/static pricing
        calculator = CostCalculator()
        pricing = calculator.cache.get_model_pricing(model_name)
        if not pricing:
            sample_cost = calculator.calculate_cost_sync(
                model_name, 1000, 1000
            )
            pricing = sample_cost.get("pricing_used", {})

        payload = {
            "status": "success",
            "model": model_name,
            "provider": provider,
            "pricing": pricing,
            "last_updated": datetime.now(UTC).isoformat(),
        }
        return jsonify(payload)

    except Exception:
        logger.exception(f"Error getting pricing for model: {model_name}")
        return jsonify({"error": "An internal error occurred"}), 500

1681 

1682 

@metrics_bp.route("/api/cost-calculation", methods=["POST"])
@login_required
@require_json_body(error_message="No data provided")
def api_cost_calculation():
    """Calculate cost for token usage.

    JSON body:
        model_name: Required.
        provider: Optional; echoed back in the response.
        prompt_tokens: Optional number, default 0.
        completion_tokens: Optional number, default 0.

    Returns:
        A JSON response echoing the inputs, ``total_tokens`` and the fields
        returned by ``calculate_cost_sync``; ``{"error": ...}`` with HTTP
        400 when ``model_name`` is missing or a token count is not a number;
        ``{"error": ...}`` with HTTP 500 if anything raises.
    """
    try:
        data = request.get_json()
        model_name = data.get("model_name")
        provider = data.get("provider")  # Optional provider parameter
        prompt_tokens = data.get("prompt_tokens", 0)
        completion_tokens = data.get("completion_tokens", 0)

        if not model_name:
            return jsonify({"error": "model_name is required"}), 400

        # Reject non-numeric counts up front: a string or null would
        # otherwise fail inside the calculation or the addition below and
        # surface as a 500. bool is excluded explicitly because it is a
        # subclass of int.
        for field_name, value in (
            ("prompt_tokens", prompt_tokens),
            ("completion_tokens", completion_tokens),
        ):
            if isinstance(value, bool) or not isinstance(value, (int, float)):
                return jsonify({"error": f"{field_name} must be a number"}), 400

        from ...metrics.pricing.cost_calculator import CostCalculator

        # Use synchronous cost calculation
        calculator = CostCalculator()
        cost_data = calculator.calculate_cost_sync(
            model_name, prompt_tokens, completion_tokens
        )

        return jsonify(
            {
                "status": "success",
                "model_name": model_name,
                "provider": provider,
                "prompt_tokens": prompt_tokens,
                "completion_tokens": completion_tokens,
                "total_tokens": prompt_tokens + completion_tokens,
                # Spread last, so calculator fields win on a key clash
                **cost_data,
            }
        )

    except Exception:
        logger.exception("Error calculating cost")
        return jsonify({"error": "An internal error occurred"}), 500

1721 

1722 

@metrics_bp.route("/api/research-costs/<string:research_id>")
@login_required
def api_research_costs(research_id):
    """Get cost analysis for a specific research session.

    Sums the cost and the token counts of every ``TokenUsage`` row recorded
    for the research.

    Args:
        research_id: Identifier of the research, taken from the URL.

    Returns:
        A JSON response with ``total_cost`` (rounded to 6 decimals),
        ``total_tokens``, ``prompt_tokens`` and ``completion_tokens``; a
        zero-cost payload with a message when there is no usage data;
        ``{"error": ...}`` with HTTP 500 if anything raises.
    """
    try:
        username = flask_session["username"]

        with get_user_db_session(username) as session:
            # Get token usage records for this research
            usage_records = (
                session.query(TokenUsage)
                .filter(TokenUsage.research_id == research_id)
                .all()
            )

            if not usage_records:
                return jsonify(
                    {
                        "status": "success",
                        "research_id": research_id,
                        "total_cost": 0.0,
                        "message": "No token usage data found for this research session",
                    }
                )

            from ...metrics.pricing.cost_calculator import CostCalculator

            # Use synchronous calculation for research costs
            calculator = CostCalculator()

            total_cost = 0
            total_prompt_tokens = 0
            total_completion_tokens = 0
            for record in usage_records:
                # Treat a NULL token count as 0 so one incomplete usage row
                # cannot fail the whole request.
                prompt_tokens = record.prompt_tokens or 0
                completion_tokens = record.completion_tokens or 0

                cost_data = calculator.calculate_cost_sync(
                    record.model_name, prompt_tokens, completion_tokens
                )
                total_cost += cost_data["total_cost"]
                total_prompt_tokens += prompt_tokens
                total_completion_tokens += completion_tokens

            return jsonify(
                {
                    "status": "success",
                    "research_id": research_id,
                    "total_cost": round(total_cost, 6),
                    "total_tokens": total_prompt_tokens
                    + total_completion_tokens,
                    "prompt_tokens": total_prompt_tokens,
                    "completion_tokens": total_completion_tokens,
                }
            )

    except Exception:
        logger.exception(
            f"Error getting research costs for research: {research_id}"
        )
        return jsonify({"error": "An internal error occurred"}), 500

1802 

1803 

@metrics_bp.route("/api/cost-analytics")
@login_required
def api_cost_analytics():
    """Get cost analytics across all research sessions.

    Query parameters:
        period: Time window applied to ``TokenUsage.timestamp``, default
            ``"30d"``.

    When more than 1000 usage rows match, only the 1000 most recent are
    analysed.

    Returns:
        A JSON response with the ``overview`` totals, the 10 most expensive
        research sessions and the number of distinct research ids. On any
        failure an all-zero payload with an ``error`` field is returned with
        HTTP 200, deliberately, so the UI keeps working.
    """
    # Read before the try block: the fallback response in the except handler
    # echoes the period, so it must be bound even when the first statement
    # inside the try raises.
    period = request.args.get("period", "30d")

    try:
        username = flask_session["username"]

        with get_user_db_session(username) as session:
            # Get token usage for the period
            query = session.query(TokenUsage)
            time_condition = get_time_filter_condition(
                period, TokenUsage.timestamp
            )
            if time_condition is not None:
                query = query.filter(time_condition)

            # First check if we have any records to avoid expensive queries
            record_count = query.count()

            if record_count == 0:
                return jsonify(
                    {
                        "status": "success",
                        "period": period,
                        "overview": {
                            "total_cost": 0.0,
                            "total_tokens": 0,
                            "prompt_tokens": 0,
                            "completion_tokens": 0,
                        },
                        "top_expensive_research": [],
                        "research_count": 0,
                        "message": "No token usage data found for this period",
                    }
                )

            # If we have too many records, limit to recent ones to avoid timeout
            if record_count > 1000:
                logger.warning(
                    f"Large dataset detected ({record_count} records), limiting to recent 1000 for performance"
                )
                usage_records = (
                    query.order_by(TokenUsage.timestamp.desc())
                    .limit(1000)
                    .all()
                )
            else:
                usage_records = query.all()

            from ...metrics.pricing.cost_calculator import CostCalculator

            # Use synchronous calculation
            calculator = CostCalculator()

            # One pass: each record is priced once and the result feeds both
            # the overall totals and its research's running total.
            # NOTE(review): this relies on calculate_cost_sync returning the
            # same cost for the same arguments — confirm.
            total_cost = 0
            total_prompt_tokens = 0
            total_completion_tokens = 0
            research_totals: dict[Any, float] = {}
            for record in usage_records:
                # Treat a NULL token count as 0 so one incomplete usage row
                # cannot blank out the whole report.
                prompt_tokens = record.prompt_tokens or 0
                completion_tokens = record.completion_tokens or 0

                cost = calculator.calculate_cost_sync(
                    record.model_name, prompt_tokens, completion_tokens
                )["total_cost"]

                total_cost += cost
                total_prompt_tokens += prompt_tokens
                total_completion_tokens += completion_tokens

                rid = record.research_id
                research_totals[rid] = research_totals.get(rid, 0) + cost

            cost_summary = {
                "total_cost": round(total_cost, 6),
                "total_tokens": total_prompt_tokens + total_completion_tokens,
                "prompt_tokens": total_prompt_tokens,
                "completion_tokens": total_completion_tokens,
            }

            # Top expensive research sessions, ranked on the rounded cost
            top_expensive = sorted(
                (
                    (rid, round(research_total, 6))
                    for rid, research_total in research_totals.items()
                ),
                key=lambda item: item[1],
                reverse=True,
            )[:10]

            return jsonify(
                {
                    "status": "success",
                    "period": period,
                    "overview": cost_summary,
                    "top_expensive_research": [
                        {"research_id": rid, "total_cost": cost}
                        for rid, cost in top_expensive
                    ],
                    "research_count": len(research_totals),
                }
            )

    except Exception:
        logger.exception("Error getting cost analytics")
        # Return a more graceful error response
        return (
            jsonify(
                {
                    "status": "success",
                    "period": period,
                    "overview": {
                        "total_cost": 0.0,
                        "total_tokens": 0,
                        "prompt_tokens": 0,
                        "completion_tokens": 0,
                    },
                    "top_expensive_research": [],
                    "research_count": 0,
                    "error": "Cost analytics temporarily unavailable",
                }
            ),
            200,
        )  # Return 200 to avoid breaking the UI

1966 

1967 

@metrics_bp.route("/links")
@login_required
def link_analytics():
    """Render the link analytics page."""
    template_name = "pages/link_analytics.html"
    return render_template_with_defaults(template_name)

1973 

1974 

@metrics_bp.route("/api/link-analytics")
@login_required
def api_link_analytics():
    """Return link analytics for the logged-in user.

    Query parameters:
        period: Time window, default ``"30d"``.

    Returns:
        A JSON response whose ``data`` is the ``link_analytics`` entry of
        the ``get_link_analytics`` result, or a generic error payload with
        HTTP 500 if anything raises.
    """
    try:
        current_user = flask_session["username"]
        period = request.args.get("period", "30d")

        analytics = get_link_analytics(period, current_user)

        payload = {
            "status": "success",
            "data": analytics["link_analytics"],
            "period": period,
        }
        return jsonify(payload)

    except Exception:
        logger.exception("Error getting link analytics")
        failure = {
            "status": "error",
            "message": "An internal error occurred. Please try again later.",
        }
        return jsonify(failure), 500

2006 

2007 

@metrics_bp.route("/api/domain-classifications", methods=["GET"])
@login_required
def api_get_domain_classifications():
    """Return every stored domain classification for the logged-in user.

    Returns:
        A JSON response with the classifications as dicts and their
        ``total``, or an error payload with HTTP 500 if anything raises.
        The classifier, once created, is closed on every path.
    """
    classifier = None
    try:
        classifier = DomainClassifier(flask_session["username"])
        found = classifier.get_all_classifications()

        payload = {
            "status": "success",
            "classifications": [item.to_dict() for item in found],
            "total": len(found),
        }
        return jsonify(payload)

    except Exception:
        logger.exception("Error getting domain classifications")
        failure = {
            "status": "error",
            "message": "Failed to retrieve classifications",
        }
        return jsonify(failure), 500
    finally:
        # Nothing to release when the constructor itself failed
        if classifier is not None:
            from ...utilities.resource_utils import safe_close

            safe_close(classifier, "domain classifier")

2037 

2038 

@metrics_bp.route("/api/domain-classifications/summary", methods=["GET"])
@login_required
def api_get_classifications_summary():
    """Return the classifier's per-category summary for the logged-in user.

    Returns:
        A JSON response with ``status`` and ``summary``, or an error payload
        with HTTP 500 if anything raises. The classifier, once created, is
        closed on every path.
    """
    classifier = None
    try:
        classifier = DomainClassifier(flask_session["username"])
        category_summary = classifier.get_categories_summary()
        return jsonify({"status": "success", "summary": category_summary})

    except Exception:
        logger.exception("Error getting classifications summary")
        failure = {"status": "error", "message": "Failed to retrieve summary"}
        return jsonify(failure), 500
    finally:
        # Nothing to release when the constructor itself failed
        if classifier is not None:
            from ...utilities.resource_utils import safe_close

            safe_close(classifier, "domain classifier")

2062 

2063 

@metrics_bp.route("/api/domain-classifications/classify", methods=["POST"])
@login_required
def api_classify_domains():
    """Trigger classification of a specific domain or batch classification.

    JSON body:
        domain: Domain to classify (single-domain mode).
        batch: When truthy, classify all domains; takes precedence over
            ``domain``.
        force_update: Passed through to the classifier, default ``False``.

    Returns:
        The classification (single mode) or the batch results as JSON; an
        error payload with HTTP 400 when neither ``domain`` nor ``batch`` is
        given or the single domain could not be classified; an error payload
        with HTTP 500 if anything raises.
    """
    classifier = None
    try:
        username = flask_session["username"]

        # silent=True: a missing or malformed JSON body is treated as "no
        # parameters" and answered with the 400 below, instead of raising
        # here and being reported as a 500. A body that is not a JSON object
        # is handled the same way.
        data = request.get_json(silent=True)
        if not isinstance(data, dict):
            data = {}
        domain = data.get("domain")
        force_update = data.get("force_update", False)
        batch_mode = data.get("batch", False)

        # Validate before touching the database or building a classifier
        if not domain and not batch_mode:
            return jsonify(
                {
                    "status": "error",
                    "message": "Must provide either 'domain' or set 'batch': true",
                }
            ), 400

        # Get settings snapshot for LLM configuration
        from ...settings.manager import SettingsManager
        from ...database.session_context import get_user_db_session

        with get_user_db_session(username) as db_session:
            settings_manager = SettingsManager(db_session=db_session)
            settings_snapshot = settings_manager.get_all_settings()

        classifier = DomainClassifier(
            username, settings_snapshot=settings_snapshot
        )

        if batch_mode:
            # Batch classification runs synchronously inside this request;
            # the response is sent only after every domain was processed.
            # This should really be a background task.
            logger.info("Starting batch classification of all domains")
            results = classifier.classify_all_domains(force_update)

            return jsonify({"status": "success", "results": results})

        # Classify single domain
        logger.info(f"Classifying single domain: {domain}")
        classification = classifier.classify_domain(domain, force_update)
        if classification:
            return jsonify(
                {
                    "status": "success",
                    "classification": classification.to_dict(),
                }
            )
        return jsonify(
            {
                "status": "error",
                "message": f"Failed to classify domain: {domain}",
            }
        ), 400

    except Exception:
        logger.exception("Error classifying domains")
        return jsonify(
            {"status": "error", "message": "Failed to classify domains"}
        ), 500
    finally:
        # Nothing to release when no classifier was created
        if classifier is not None:
            from ...utilities.resource_utils import safe_close

            safe_close(classifier, "domain classifier")

2130 

2131 

@metrics_bp.route("/api/domain-classifications/progress", methods=["GET"])
@login_required
def api_classification_progress():
    """Report how many of the user's resource domains have been classified.

    The total is the number of distinct normalized domains extracted from
    the stored resource URLs; the classified figure is the row count of the
    classification table.

    Returns:
        JSON with ``total_domains``, ``classified``, ``unclassified``,
        ``percentage`` (one decimal place) and the sorted ``all_domains``
        list, or a generic error payload with HTTP 500 on failure.
    """
    try:
        username = flask_session["username"]

        # Get counts of classified vs unclassified domains
        with get_user_db_session(username) as session:
            # Collect the distinct domains behind every stored resource URL.
            resources = session.query(ResearchResource.url).distinct().all()
            domains = set()

            for (url,) in resources:
                domain = _extract_domain(url) if url else None
                if domain:
                    domains.add(domain)

            all_domains = sorted(domains)
            total_domains = len(all_domains)

            # Count classified domains
            classified_count = session.query(DomainClassification).count()

            # The classification count is not restricted to the domains
            # found above, so it can exceed total_domains. Clamp the derived
            # figures so the response never reports a negative remainder or
            # more than 100 % progress.
            unclassified = max(0, total_domains - classified_count)
            if total_domains > 0:
                percentage = round(
                    min(100.0, classified_count / total_domains * 100), 1
                )
            else:
                percentage = 0

            return jsonify(
                {
                    "status": "success",
                    "progress": {
                        "total_domains": total_domains,
                        "classified": classified_count,
                        "unclassified": unclassified,
                        "percentage": percentage,
                        "all_domains": all_domains,  # Return all domains for classification
                    },
                }
            )

    except Exception:
        logger.exception("Error getting classification progress")
        return jsonify(
            {"status": "error", "message": "Failed to retrieve progress"}
        ), 500

2180 

2181 

2182# --------------------------------------------------------------------------- 

2183# Journal Quality Dashboard 

2184# --------------------------------------------------------------------------- 

2185 

2186 

@metrics_bp.route("/journals")
@login_required
def journal_quality():
    """Render the journal quality dashboard page."""
    template_name = "pages/journal_quality.html"
    return render_template_with_defaults(template_name)

2192 

2193 

@metrics_bp.route("/api/journal-data/status")
@login_required
def api_journal_data_status():
    """Return the status of the downloadable journal data files as JSON.

    Any failure (including a failing import of the downloader module) is
    logged and answered with a generic error payload and HTTP 500.
    """
    try:
        from ...journal_quality.downloader import (
            get_journal_data_status as _read_status,
        )

        status = _read_status()
        return jsonify(status)
    except Exception:
        logger.exception("Error checking journal data status")
        return jsonify({"error": "Failed to check status"}), 500

2207 

2208 

@metrics_bp.route("/api/journal-data/download", methods=["POST"])
@login_required
@journal_data_limit
def api_journal_data_download():
    """Trigger download/update of journal data files.

    Rate-limited to 2 per hour per authenticated user: the download streams
    several hundred MB and rebuilds the on-disk reference DB, so unbounded
    invocation is a DoS vector.

    The optional JSON object body may carry ``force``; any other body
    (absent, unparsable, or not a JSON object) means ``force=False``.

    Returns:
        JSON ``{"success": bool, "message": str}``. A download that reports
        failure is answered with the default status code; an unexpected
        exception is answered with HTTP 500.
    """
    try:
        from ...journal_quality.downloader import (
            download_journal_data,
            get_download_state,
        )
        from ...journal_quality.data_sources import ALL_SOURCES

        # Parse tolerantly: silent=True returns None for a non-JSON or
        # malformed body instead of raising, and the isinstance check keeps
        # valid-but-non-object JSON (null, list, string) from blowing up on
        # .get() and turning a harmless request into a 500.
        body = request.get_json(silent=True)
        force = body.get("force", False) if isinstance(body, dict) else False
        success, internal_message = download_journal_data(force=force)
        if not success:
            logger.warning(f"Journal data download failed: {internal_message}")
            return jsonify({"success": False, "message": "Download failed"})

        # download_journal_data() already calls build_db() + reset_db()
        # internally on its success path (downloader.py:563 → db.py:1209),
        # so the DB is live on disk and the cached engine has been
        # invalidated by the time we get here. Do not add a second build
        # here — it would run the full ~30 s rebuild a second time and
        # write to the legacy `journal_reference.db` filename that the
        # downloader just cleaned up.

        # Build the user-facing message locally from structured state
        # (ints + developer-authored source labels). We deliberately do
        # NOT echo `internal_message` from download_journal_data: keeping
        # the response safe-by-construction means a future refactor that
        # lets arbitrary strings (exception info, user input, PII) slip
        # into the downloader's message cannot reach the client.
        counts = get_download_state().get("counts")
        if counts is not None:
            parts = [
                f"{int(counts.get(src.key) or 0)} {src.count_label}"
                for src in ALL_SOURCES
            ]
            user_message = (
                f"Fetched {' + '.join(parts)}. Database rebuilt successfully."
            )
        else:
            # `counts` is None when download_journal_data took its
            # early-return "already up to date" branch (no fetch ran).
            user_message = "Journal data is already up to date."
        return jsonify({"success": True, "message": user_message})
    except Exception:
        logger.exception("Error downloading journal data")
        return jsonify({"success": False, "message": "Download failed"}), 500

2263 

2264 

#: Allowlist of ``score_source`` values accepted by ``/api/journals``.
#: Mirrors the writer side: ``openalex`` / ``doaj`` for reference-DB
#: hits, ``llm`` for Tier 4 cache rows. The empty string means "no
#: filter" and is handled by the caller before validation.
_ALLOWED_SCORE_SOURCES = frozenset(("openalex", "doaj", "llm"))

#: Upper bound on the accepted ``page`` query parameter. A crafted
#: ``?page=10**9`` is rejected during input validation, before any
#: query is issued, instead of relying on the post-query clamp.
_MAX_PAGE = 10_000

2275 

2276 

@metrics_bp.route("/api/journals")
@login_required
@journals_read_limit
def api_journal_quality():
    """Return one page of journal quality rows as JSON.

    Data is read from the bundled read-only reference database (~217K
    journals) rather than the per-user DB, so the dashboard is always
    populated.

    Query params:
        page (int): 1-indexed page number (default 1, max 10000)
        per_page (int): rows per page, max 200 (default 50)
        search (str): name substring filter
        tier (str): elite/strong/moderate/low/predatory
        score_source (str): openalex/doaj/llm (allowlisted)
        sort (str): column to sort by (default quality)
        order (str): asc or desc (default desc)
        include_summary (str): "true" to attach summary statistics
    """
    try:
        from ...journal_quality.db import get_journal_reference_db

        ref = get_journal_reference_db()
        if not ref.available:
            unavailable = {
                "status": "error",
                "message": "Journal reference database not available.",
            }
            return jsonify(unavailable), 503

        args = request.args

        # --- pagination -------------------------------------------------
        try:
            requested_page = int(args.get("page", 1))
            requested_size = int(args.get("per_page", 50))
        except (TypeError, ValueError):
            bad_paging = {
                "status": "error",
                "message": "Invalid pagination parameters",
            }
            return jsonify(bad_paging), 400
        page = max(1, requested_page)
        per_page = min(max(1, requested_size), 200)

        if page > _MAX_PAGE:
            too_far = {
                "status": "error",
                "message": (
                    f"page exceeds maximum ({_MAX_PAGE}); narrow the "
                    "filter or increase per_page"
                ),
            }
            return jsonify(too_far), 400

        # --- filters ----------------------------------------------------
        score_source = args.get("score_source", "")
        if score_source and score_source not in _ALLOWED_SCORE_SOURCES:
            bad_source = {
                "status": "error",
                "message": (
                    f"Invalid score_source; must be one of "
                    f"{sorted(_ALLOWED_SCORE_SOURCES)}"
                ),
            }
            return jsonify(bad_source), 400

        journals, total = ref.get_journals_page(
            page=page,
            per_page=per_page,
            search=args.get("search", ""),
            tier=args.get("tier", ""),
            score_source=score_source,
            sort=args.get("sort", "quality"),
            order=args.get("order", "desc"),
        )

        # Clamp the echoed page so the UI never displays out-of-range
        # numbers on crafted input (e.g. ?page=10**9). SQLite's OFFSET on
        # an indexed ORDER BY caps work at ~total rows regardless of the
        # requested offset, so no DB-level clamp is needed.
        if per_page > 0 and total > 0:
            total_pages = -(-total // per_page)  # ceiling division
        else:
            total_pages = 1
        page = min(page, total_pages)

        result = {
            "status": "success",
            "journals": journals,
            "pagination": {
                "page": page,
                "per_page": per_page,
                "total_count": total,
                "total_pages": total_pages,
            },
        }

        # The summary costs three extra SQL queries, so it is attached
        # only when the client explicitly asks for it.
        wants_summary = args.get("include_summary", "false") == "true"
        if wants_summary:
            summary = ref.get_summary()
            summary["quality_distribution"] = ref.get_quality_distribution()
            summary["source_distribution"] = ref.get_source_distribution()
            result["summary"] = summary

        return jsonify(result)

    except Exception:
        logger.exception("Error getting journal quality data")
        internal_error = {
            "status": "error",
            "message": "An internal error occurred. Please try again later.",
        }
        return jsonify(internal_error), 500

2392 

2393 

def _ref_db_lookup(ref_db, name: str) -> dict:
    """Fetch the display bibliometrics for one journal from the reference DB.

    The result carries the keys the dashboard renders: ``h_index``,
    ``impact_factor``, ``sjr_quartile``, ``is_predatory``,
    ``predatory_source``, ``is_in_doaj``, ``has_doaj_seal`` and
    ``publisher``. Absent values become ``None`` (plain fields) or
    ``False`` (flags).

    An empty dict is returned when there is no reference DB, no name, or
    the lookup raises — enrichment is best-effort and must never break the
    dashboard.
    """
    if not name or ref_db is None:
        return {}

    try:
        entry = ref_db.lookup_source(name=name) or {}
    except Exception:  # noqa: silent-exception
        # Best-effort enrichment: a failing lookup simply means "no
        # bibliometric extras"; the DB layer is expected to do its own
        # logging where that matters.
        return {}

    # (output key, key in the lookup result, coerce to bool?)
    # The lookup result is compact: the quartile is stored under
    # "quartile" and the predatory/DOAJ flags may be missing entirely.
    field_spec = (
        ("h_index", "h_index", False),
        ("impact_factor", "impact_factor", False),
        ("sjr_quartile", "quartile", False),
        ("is_predatory", "is_predatory", True),
        ("predatory_source", "predatory_source", False),
        ("is_in_doaj", "is_in_doaj", True),
        ("has_doaj_seal", "has_doaj_seal", True),
        ("publisher", "publisher", False),
    )
    return {
        out_key: bool(entry.get(src_key)) if as_flag else entry.get(src_key)
        for out_key, src_key, as_flag in field_spec
    }

2426 

2427 

def _get_ref_db_or_none():
    """Return the journal reference DB handle, or ``None`` on any failure.

    The reference DB is optional: when importing or obtaining it raises,
    callers get ``None`` and the dashboard renders from user-DB data only.
    """
    try:
        from ...journal_quality.db import (
            get_journal_reference_db as _open_reference_db,
        )

        handle = _open_reference_db()
    except Exception:  # noqa: silent-exception
        # Deliberately silent — a missing or broken reference DB just
        # switches the dashboard to user-DB-only rendering.
        handle = None
    return handle

2443 

2444 

2445def _resolve_paper_quality( 

2446 llm_quality: int | None, enrichment: dict 

2447) -> tuple[int | None, str | None]: 

2448 """Pick a quality score for a dashboard row. 

2449 

2450 Precedence: current LLM verdict from the user's ``journals`` table 

2451 (Tier 4 cache, keyed by NFKC-normalized container_title) → live 

2452 derivation from the bundled reference DB row (Tier 1-3). Always 

2453 live — no frozen per-Paper copy exists, so a re-scored journal 

2454 propagates automatically. Returns (score, source_label) or 

2455 (None, None) if neither path had data. 

2456 """ 

2457 if llm_quality is not None: 

2458 return llm_quality, "llm" 

2459 if not enrichment: 

2460 return None, None 

2461 # enrichment comes from _source_to_dashboard_dict — row.quality is 

2462 # the ref-DB's pre-computed score (same formula as the filter uses), 

2463 # so we trust it directly rather than re-running derive_quality_score. 

2464 q = enrichment.get("quality") 

2465 if q is not None: 

2466 return int(q), enrichment.get("score_source") or "openalex" 

2467 return None, None 

2468 

2469 

def _lookup_journal_llm_quality(
    db, container_titles: list[str]
) -> dict[str, int]:
    """Fetch the current LLM quality verdicts for a batch of journal titles.

    Titles are normalized with ``normalize_name`` and de-duplicated, then
    matched against ``Journal.name_lower`` in a single ``IN (...)`` query
    that skips rows without a quality value.

    Returns a dict mapping normalized name → ``int(Journal.quality)``.
    Journals with no stored verdict are simply absent, which lets callers
    fall through to the bundled reference DB.
    """
    from ...journal_quality.scoring import normalize_name

    wanted = {
        normalize_name(title) for title in container_titles or () if title
    }
    if not wanted:
        return {}

    scored = db.query(Journal.name_lower, Journal.quality).filter(
        Journal.name_lower.in_(list(wanted)),
        Journal.quality.isnot(None),
    )
    return {key: int(score) for key, score in scored.all()}

2495 

2496 

def _empty_journals_payload() -> dict:
    """Build the success payload used when there are no journals to report.

    A fresh dict is built on every call so no caller shares a mutable
    instance with another.
    """
    return {
        "status": "success",
        "summary": {
            "total_journals": 0,
            "avg_quality": None,
            "total_papers": 0,
            "predatory_blocked": 0,
        },
        "quality_distribution": {},
        "journals": [],
    }


def _build_journal_rows(db, ref_db, rows) -> tuple[list[dict], list[int]]:
    """Convert aggregated per-journal rows into dashboard journal dicts.

    Quality is resolved live for every row: the LLM verdict from the
    user's ``journals`` table wins, otherwise the reference-DB enrichment
    is used (see ``_resolve_paper_quality``). Both lookups are batched —
    one call for the whole row set each.

    Args:
        db: Open per-user database session, used for the LLM verdict lookup.
        ref_db: Reference DB handle, or ``None`` when it is unavailable
            (rows are then emitted without enrichment fields).
        rows: Result rows exposing ``container_title``, ``paper_count``,
            ``year_min`` and ``year_max``.

    Returns:
        ``(journals, qualities)``: one dict per input row, in row order,
        and the resolved scores of the rows that have one.
    """
    from ...journal_quality.scoring import normalize_name

    titles = [row.container_title for row in rows]
    enrich_map = (
        ref_db.lookup_sources_batch(titles) if ref_db is not None else {}
    )
    llm_by_name = _lookup_journal_llm_quality(db, titles)

    journals: list[dict] = []
    qualities: list[int] = []
    for row in rows:
        normalized = normalize_name(row.container_title)
        enrichment = enrich_map.get(normalized, {})
        quality, source_label = _resolve_paper_quality(
            llm_by_name.get(normalized), enrichment
        )
        if quality is not None:
            qualities.append(quality)
        journals.append(
            {
                "name": row.container_title,
                "quality": quality,
                "score_source": source_label,
                "paper_count": row.paper_count,
                "year_min": row.year_min,
                "year_max": row.year_max,
                # Remaining enrichment fields pass through untouched; the
                # three excluded keys were resolved explicitly above.
                **{
                    k: v
                    for k, v in enrichment.items()
                    if k not in ("quality", "score_source", "name")
                },
            }
        )
    return journals, qualities


def _journals_payload(rows, journals, qualities, predatory_blocked) -> dict:
    """Assemble the response payload shared by the journal endpoints.

    ``avg_quality`` is the mean of ``qualities`` rounded to one decimal
    (``None`` when there are no scores); ``quality_distribution`` maps
    each score, as a string, to how many rows resolved to it.
    """
    from collections import Counter

    avg_quality = (
        round(sum(qualities) / len(qualities), 1) if qualities else None
    )
    return {
        "status": "success",
        "summary": {
            "total_journals": len(journals),
            "avg_quality": avg_quality,
            "total_papers": sum(row.paper_count for row in rows),
            "predatory_blocked": predatory_blocked,
        },
        "quality_distribution": dict(Counter(str(q) for q in qualities)),
        "journals": journals,
    }


@metrics_bp.route("/api/journals/user-research")
@login_required
@journals_read_limit
def api_user_research_journals():
    """Get journals from the user's own research sessions.

    Paper-rooted query: groups by ``Paper.container_title`` and counts
    the papers per title that still have at least one appearance. Quality
    is resolved live — the LLM verdict via a batch lookup against the
    user's ``journals`` table (keyed by normalized container_title),
    otherwise via the bundled read-only reference DB. A re-scored journal
    propagates to existing research rows automatically because no
    per-Paper score is stored.

    Returns:
        JSON with ``summary``, ``quality_distribution`` and ``journals``;
        HTTP 401 without a session username, HTTP 500 on failure.
    """
    username = flask_session.get("username")
    if not username:
        return jsonify({"status": "error", "message": "Not authenticated"}), 401

    try:
        from sqlalchemy import inspect as sa_inspect

        with get_user_db_session(username) as db:
            inspector = sa_inspect(db.bind)
            if not inspector.has_table("papers"):
                return jsonify(_empty_journals_payload())

            # Top 200 journals by paper count in this user's research.
            # Orphan Papers (whose ``PaperAppearance`` rows were
            # cascade-deleted when their research session was deleted)
            # are excluded so the dashboard reflects what the user
            # currently has, not residual rows from deleted sessions.
            # See issue #3544.
            rows = (
                db.query(
                    Paper.container_title,
                    func.count(Paper.id).label("paper_count"),
                    func.min(Paper.year).label("year_min"),
                    func.max(Paper.year).label("year_max"),
                )
                .filter(Paper.container_title.isnot(None))
                .filter(Paper.appearances.any())
                .group_by(Paper.container_title)
                .order_by(func.count(Paper.id).desc())
                .limit(200)
                .all()
            )

            if not rows:
                return jsonify(_empty_journals_payload())

            # Aggregate stats are computed across the top-200 slice for
            # the dashboard summary — matches how the table renders.
            ref_db = _get_ref_db_or_none()
            journals, qualities = _build_journal_rows(db, ref_db, rows)

            # Predatory count uses the full set of distinct
            # container_titles across the user's research, not just the
            # top-200 display slice. One batched query.
            #
            # KNOWN-DEFERRED: unbounded SELECT DISTINCT. Acceptable today
            # because typical users have <5K distinct titles even after
            # years of use, and count_predatory_by_names documents
            # support up to ~100K params. Adding .limit(N) was considered
            # and rejected — it would SILENTLY UNDERCOUNT predatory
            # journals, which violates the no-fallbacks rule. Proper fix
            # (cross-DB correlated subquery or TTL cache) is tracked as
            # a post-merge follow-up. Threshold for visible impact:
            # ~50K papers.
            predatory_blocked = 0
            if ref_db is not None:
                # Same orphan-exclusion as the top-200 query above —
                # otherwise predatory_blocked stays inflated by titles
                # whose only Papers belong to deleted research sessions.
                all_names = [
                    name
                    for (name,) in db.query(Paper.container_title)
                    .filter(Paper.container_title.isnot(None))
                    .filter(Paper.appearances.any())
                    .distinct()
                    .all()
                ]
                predatory_blocked = ref_db.count_predatory_by_names(all_names)

            return jsonify(
                _journals_payload(rows, journals, qualities, predatory_blocked)
            )
    except Exception:
        logger.exception("Error getting user research journals")
        return (
            jsonify(
                {
                    "status": "error",
                    "message": "Failed to load your research data.",
                }
            ),
            500,
        )


@metrics_bp.route("/api/journals/research/<research_id>")
@login_required
@journals_read_limit
def api_research_journals(research_id):
    """Get journals encountered in a single research session.

    Filters the per-user papers table by joining through
    Paper → PaperAppearance → ResearchResource and matching ``research_id``.
    Quality is resolved live (journals.quality + bundled reference DB)
    so results always reflect the current verdict, not a stale snapshot.
    Mirrors the response shape of /api/journals/user-research so the
    dashboard can reuse its rendering code.

    Args:
        research_id: Identifier of the research session, taken from the URL.

    Returns:
        JSON with ``summary``, ``quality_distribution`` and ``journals``;
        HTTP 401 without a session username, HTTP 404 when the research
        does not exist in this user's DB, HTTP 500 on failure.
    """
    username = flask_session.get("username")
    if not username:
        return jsonify({"status": "error", "message": "Not authenticated"}), 401

    try:
        from sqlalchemy import inspect as sa_inspect

        with get_user_db_session(username) as db:
            inspector = sa_inspect(db.bind)
            if not inspector.has_table("papers") or not inspector.has_table(
                "paper_appearances"
            ):
                return jsonify(_empty_journals_payload())

            # Verify the research_id belongs to this user before exposing
            # any data — research_history is in the same per-user DB so
            # the existence check doubles as an ownership check.
            from ...database.models.research import ResearchHistory

            research = (
                db.query(ResearchHistory.id)
                .filter(ResearchHistory.id == research_id)
                .first()
            )
            if research is None:
                return (
                    jsonify(
                        {"status": "error", "message": "Research not found"}
                    ),
                    404,
                )

            # Aggregate container_title → paper_count for this research.
            # Join chain: Paper → PaperAppearance → ResearchResource.
            # NOTE(review): with these joins count(Paper.id) yields one per
            # matching appearance row. If a paper can appear more than once
            # within a single research this counts it repeatedly, unlike
            # the cross-research rollup — confirm which is intended.
            rows = (
                db.query(
                    Paper.container_title,
                    func.count(Paper.id).label("paper_count"),
                    func.min(Paper.year).label("year_min"),
                    func.max(Paper.year).label("year_max"),
                )
                .join(
                    PaperAppearance,
                    PaperAppearance.paper_id == Paper.id,
                )
                .join(
                    ResearchResource,
                    ResearchResource.id == PaperAppearance.resource_id,
                )
                .filter(
                    ResearchResource.research_id == research_id,
                    Paper.container_title.isnot(None),
                )
                .group_by(Paper.container_title)
                .order_by(func.count(Paper.id).desc())
                .all()
            )

            if not rows:
                return jsonify(_empty_journals_payload())

            ref_db = _get_ref_db_or_none()
            journals, qualities = _build_journal_rows(db, ref_db, rows)

            # Every row is in view here (no LIMIT), so the predatory count
            # comes straight from the enrichment flag carried on each row.
            predatory_blocked = sum(
                1 for journal in journals if journal.get("is_predatory")
            )

            return jsonify(
                _journals_payload(rows, journals, qualities, predatory_blocked)
            )
    except Exception:
        logger.exception("Error getting per-research journals")
        return (
            jsonify(
                {
                    "status": "error",
                    "message": "Failed to load research journals.",
                }
            ),
            500,
        )
2836 )