Coverage for src/local_deep_research/web/routes/metrics_routes.py: 84%
902 statements
« prev ^ index » next coverage.py v7.14.1, created at 2026-06-03 23:15 +0000
« prev ^ index » next coverage.py v7.14.1, created at 2026-06-03 23:15 +0000
1"""Routes for metrics dashboard."""
3from datetime import datetime, timedelta, UTC
4from typing import Any
5from urllib.parse import urlparse
7from flask import Blueprint, jsonify, request, session as flask_session
8from loguru import logger
9from sqlalchemy import case, func
11from ...database.models import (
12 Journal,
13 Paper,
14 PaperAppearance,
15 RateLimitAttempt,
16 RateLimitEstimate,
17 Research,
18 ResearchHistory,
19 ResearchRating,
20 ResearchResource,
21 ResearchStrategy,
22 TokenUsage,
23)
24from ...constants import get_available_strategies
25from ...domain_classifier import DomainClassifier, DomainClassification
26from ...database.session_context import get_user_db_session
27from ...metrics import TokenCounter
28from ...metrics.query_utils import (
29 get_context_overflow_truncation_summary,
30 get_period_days,
31 get_time_filter_condition,
32)
33from ...metrics.search_tracker import get_search_tracker
34from ...web_search_engines.rate_limiting import get_tracker
35from ...security.decorators import require_json_body
36from ...security.rate_limiter import journal_data_limit, journals_read_limit
37from ..auth.decorators import login_required
38from ..utils.templates import render_template_with_defaults
# Blueprint that groups every metrics page and API route under /metrics.
metrics_bp = Blueprint(
    name="metrics",
    import_name=__name__,
    url_prefix="/metrics",
)

# NOTE: Route handlers deliberately read flask_session["username"] with
# direct indexing rather than .get(). @login_required guarantees the key is
# present, so a missing key fails fast if the decorator is ever removed.
48def _extract_domain(url):
49 """Extract normalized domain from URL, stripping www. prefix."""
50 try:
51 parsed = urlparse(url)
52 domain = parsed.netloc.lower()
53 if domain.startswith("www."):
54 domain = domain[4:]
55 return domain if domain else None
56 except (ValueError, AttributeError, TypeError):
57 return None
def _blank_rating_analytics(error=None):
    """Build the zeroed rating payload, adding an ``error`` key when given."""
    payload = {
        "avg_rating": None,
        "total_ratings": 0,
        "rating_distribution": {},
        "satisfaction_stats": {
            "very_satisfied": 0,
            "satisfied": 0,
            "neutral": 0,
            "dissatisfied": 0,
            "very_dissatisfied": 0,
        },
    }
    if error is not None:
        payload["error"] = error
    return {"rating_analytics": payload}


def get_rating_analytics(period="30d", research_mode="all", username=None):
    """Summarize stored research ratings for one user over a period.

    Args:
        period: Period label passed to ``get_period_days``; a falsy result
            disables the time filter.
        research_mode: Accepted for interface compatibility; the body does
            not read it.
        username: User whose database is queried. Falls back to the Flask
            session's ``"username"`` when falsy.

    Returns:
        ``{"rating_analytics": {...}}`` holding the average rating, the
        number of ratings, a per-star distribution keyed ``"1"``..``"5"``
        and the satisfaction buckets. A zeroed payload is returned when
        there is no user (with an ``error`` key), no ratings, or when any
        exception is raised (which is logged).
    """
    try:
        username = username or flask_session.get("username")
        if not username:
            return _blank_rating_analytics("No user session")

        days = get_period_days(period)

        with get_user_db_session(username) as session:
            query = session.query(ResearchRating)
            if days:
                cutoff = datetime.now(UTC) - timedelta(days=days)
                query = query.filter(ResearchRating.created_at >= cutoff)

            ratings = query.all()
            if not ratings:
                return _blank_rating_analytics()

            scores = [entry.rating for entry in ratings]
            mean_score = sum(scores) / len(scores)

            # One bucket per star value, keyed by the star as a string.
            distribution = {
                str(star): scores.count(star) for star in range(1, 6)
            }

            # The satisfaction labels are the star buckets, 5 down to 1.
            labels = (
                ("very_satisfied", "5"),
                ("satisfied", "4"),
                ("neutral", "3"),
                ("dissatisfied", "2"),
                ("very_dissatisfied", "1"),
            )
            satisfaction = {label: distribution[star] for label, star in labels}

            return {
                "rating_analytics": {
                    "avg_rating": round(mean_score, 1),
                    "total_ratings": len(ratings),
                    "rating_distribution": distribution,
                    "satisfaction_stats": satisfaction,
                }
            }

    except Exception:
        logger.exception("Error getting rating analytics")
        return _blank_rating_analytics()
def _empty_link_analytics(error=None):
    """Build the zeroed link-analytics payload, adding ``error`` when given."""
    payload = {
        "top_domains": [],
        "total_unique_domains": 0,
        "avg_links_per_research": 0,
        "domain_distribution": {},
        "source_type_analysis": {},
        "academic_vs_general": {},
        "total_links": 0,
    }
    if error is not None:
        payload["error"] = error
    return {"link_analytics": payload}


def get_link_analytics(period="30d", username=None):
    """Aggregate link/domain statistics from a user's research resources.

    Args:
        period: Period label passed to ``get_period_days``; a falsy result
            disables the time filter.
        username: User whose database is queried. Falls back to the Flask
            session's ``"username"`` when falsy.

    Returns:
        ``{"link_analytics": {...}}`` with the top ten domains, per-domain
        metrics, category and source-type counts, a daily trend and totals.
        A zeroed payload is returned when there is no user or the query
        fails (both with an ``error`` key) or when no resources match.
        All percentages and the "others" bucket are relative to the total
        number of resources, including resources without a usable domain.
    """
    try:
        if not username:
            username = flask_session.get("username")
        if not username:
            return _empty_link_analytics("No user session")

        days = get_period_days(period)

        with get_user_db_session(username) as session:
            query = session.query(ResearchResource)
            if days:
                cutoff_date = datetime.now(UTC) - timedelta(days=days)
                # The cutoff is compared as an ISO-8601 string; created_at
                # appears to be stored as text (it is sliced further down).
                query = query.filter(
                    ResearchResource.created_at >= cutoff_date.isoformat()
                )

            resources = query.all()
            if not resources:
                return _empty_link_analytics()

            total_links = len(resources)

            # Parse every URL once and keep only resources with a domain.
            resource_domains = []
            for resource in resources:
                if not resource.url:
                    continue
                domain = _extract_domain(resource.url)
                if domain:
                    resource_domains.append((resource, domain))

            all_domains = {domain for _, domain in resource_domains}

            # Load all classifications in one query instead of one per domain.
            domain_classifications_map = {}
            if all_domains:
                all_classifications = (
                    session.query(DomainClassification)
                    .filter(DomainClassification.domain.in_(all_domains))
                    .all()
                )
                domain_classifications_map = {
                    classification.domain: classification
                    for classification in all_classifications
                }

            domain_counts: dict[str, int] = {}
            domain_researches: dict[str, set] = {}
            source_types: dict[str, int] = {}
            temporal_data: dict[str, int] = {}
            category_counts: dict[str, int] = {}

            for resource, domain in resource_domains:
                try:
                    domain_counts[domain] = domain_counts.get(domain, 0) + 1
                    domain_researches.setdefault(domain, set()).add(
                        resource.research_id
                    )

                    # Daily trend: the first ten characters are YYYY-MM-DD.
                    if resource.created_at:
                        date_str = resource.created_at[:10]
                        temporal_data[date_str] = (
                            temporal_data.get(date_str, 0) + 1
                        )

                    classification = domain_classifications_map.get(domain)
                    category = (
                        classification.category
                        if classification
                        else "Unclassified"
                    )
                    category_counts[category] = (
                        category_counts.get(category, 0) + 1
                    )

                    if resource.source_type:
                        source_types[resource.source_type] = (
                            source_types.get(resource.source_type, 0) + 1
                        )
                except Exception:
                    # Best effort: one bad resource must not abort the report.
                    logger.exception(f"Error parsing URL {resource.url}")

            sorted_domains = sorted(
                domain_counts.items(), key=lambda item: item[1], reverse=True
            )
            top_10_domains = sorted_domains[:10]

            top_10_count = sum(count for _, count in top_10_domains)
            others_count = total_links - top_10_count

            unique_research_ids = {r.research_id for r in resources}
            avg_links = (
                total_links / len(unique_research_ids)
                if unique_research_ids
                else 0
            )

            temporal_trend = sorted(
                (
                    {"date": date, "count": count}
                    for date, count in temporal_data.items()
                ),
                key=lambda point: point["date"],
            )

            domain_classifications = {
                domain: {
                    "category": classification.category,
                    "subcategory": classification.subcategory,
                    "confidence": classification.confidence,
                }
                for domain, classification in domain_classifications_map.items()
            }

            # Up to three research IDs per top domain. They come from a set,
            # so which three are picked is not ordered by recency.
            domain_research_id_lists = {
                domain: list(domain_researches[domain])[:3]
                for domain, _ in top_10_domains
                if domain in domain_researches
            }
            all_research_ids = [
                r_id
                for ids in domain_research_id_lists.values()
                for r_id in ids
            ]

            # Fetch the referenced researches in a single query.
            research_by_id = {}
            if all_research_ids:
                researches = (
                    session.query(ResearchHistory)
                    .filter(ResearchHistory.id.in_(all_research_ids))
                    .all()
                )
                research_by_id = {r.id: r for r in researches}

            domain_recent_research = {}
            for domain, ids in domain_research_id_lists.items():
                entries = []
                for r_id in ids:
                    if r_id not in research_by_id:
                        continue
                    research = research_by_id[r_id]
                    entries.append(
                        {
                            "id": r_id,
                            "query": research.query[:50]
                            if research and research.query
                            else "Research",
                        }
                    )
                domain_recent_research[domain] = entries

            return {
                "link_analytics": {
                    "top_domains": [
                        {
                            "domain": domain,
                            "count": count,
                            "percentage": round(
                                count / total_links * 100, 1
                            ),
                            "research_count": len(
                                domain_researches.get(domain, set())
                            ),
                            "recent_researches": domain_recent_research.get(
                                domain, []
                            ),
                            "classification": domain_classifications.get(
                                domain, None
                            ),
                        }
                        for domain, count in top_10_domains
                    ],
                    "total_unique_domains": len(domain_counts),
                    "avg_links_per_research": round(avg_links, 1),
                    "domain_distribution": {
                        "top_10": top_10_count,
                        "others": others_count,
                    },
                    "source_type_analysis": source_types,
                    "category_distribution": category_counts,
                    # Same counts again under the key the pie chart reads.
                    "domain_categories": category_counts,
                    "total_links": total_links,
                    "total_researches": len(unique_research_ids),
                    "temporal_trend": temporal_trend,
                    "domain_metrics": {
                        domain: {
                            "usage_count": count,
                            "usage_percentage": round(
                                count / total_links * 100, 1
                            ),
                            "research_diversity": len(
                                domain_researches.get(domain, set())
                            ),
                            "frequency_rank": rank + 1,
                        }
                        for rank, (domain, count) in enumerate(top_10_domains)
                    },
                }
            }

    except Exception:
        logger.exception("Error getting link analytics")
        return _empty_link_analytics("Failed to retrieve link analytics")
def _empty_strategy_analytics(**extra):
    """Build the zeroed strategy payload; ``extra`` keys are appended last."""
    payload = {
        "total_research_with_strategy": 0,
        "total_research": 0,
        "most_popular_strategy": None,
        "strategy_usage": [],
        "strategy_distribution": {},
        "available_strategies": get_available_strategies(),
    }
    payload.update(extra)
    return {"strategy_analytics": payload}


def get_strategy_analytics(period="30d", username=None):
    """Report how often each research strategy was used in a period.

    Args:
        period: Period label passed to ``get_period_days``; a falsy result
            disables the time filter.
        username: User whose database is queried. Falls back to the Flask
            session's ``"username"`` when falsy.

    Returns:
        ``{"strategy_analytics": {...}}`` with per-strategy counts and
        percentages (most used first), the most popular strategy, totals
        and the list from ``get_available_strategies()``. A zeroed payload
        is returned when there is no user or on failure (``error`` key) and
        when the strategy table is empty (``message`` key).
    """
    try:
        username = username or flask_session.get("username")
        if not username:
            return _empty_strategy_analytics(error="No user session")

        days = get_period_days(period)

        with get_user_db_session(username) as session:
            # Nothing recorded at all: say so instead of reporting zeros.
            if session.query(ResearchStrategy).count() == 0:
                logger.warning("No research strategies found in database")
                return _empty_strategy_analytics(
                    message="Strategy tracking not yet available - run a research to start tracking"
                )

            usage_query = session.query(
                ResearchStrategy.strategy_name,
                func.count(ResearchStrategy.id).label("usage_count"),
            )
            total_query = session.query(ResearchStrategy)

            if days:
                cutoff_date = datetime.now(UTC) - timedelta(days=days)
                usage_query = usage_query.filter(
                    ResearchStrategy.created_at >= cutoff_date
                )
                total_query = total_query.filter(
                    ResearchStrategy.created_at >= cutoff_date
                )

            # Per-strategy counts, most used first.
            strategy_results = (
                usage_query.group_by(ResearchStrategy.strategy_name)
                .order_by(func.count(ResearchStrategy.id).desc())
                .all()
            )

            # Denominator for the percentages below.
            total_research = total_query.count()

            strategy_usage = [
                {
                    "strategy": name,
                    "count": uses,
                    "percentage": round(
                        (uses / total_research * 100)
                        if total_research > 0
                        else 0,
                        1,
                    ),
                }
                for name, uses in strategy_results
            ]
            strategy_distribution = {
                name: uses for name, uses in strategy_results
            }

            most_popular = None
            if strategy_usage:
                most_popular = strategy_usage[0]["strategy"]

            return {
                "strategy_analytics": {
                    "total_research_with_strategy": sum(
                        entry["count"] for entry in strategy_usage
                    ),
                    "total_research": total_research,
                    "most_popular_strategy": most_popular,
                    "strategy_usage": strategy_usage,
                    "strategy_distribution": strategy_distribution,
                    "available_strategies": get_available_strategies(),
                }
            }

    except Exception:
        logger.exception("Error getting strategy analytics")
        return _empty_strategy_analytics(
            error="Failed to retrieve strategy data"
        )
def _empty_rate_limiting_analytics(error):
    """Build the zeroed rate-limiting payload carrying ``error``."""
    return {
        "rate_limiting": {
            "total_attempts": 0,
            "successful_attempts": 0,
            "failed_attempts": 0,
            "success_rate": 0,
            "rate_limit_events": 0,
            "avg_wait_time": 0,
            "avg_successful_wait": 0,
            "tracked_engines": 0,
            "engine_stats": [],
            "total_engines_tracked": 0,
            "healthy_engines": 0,
            "degraded_engines": 0,
            "poor_engines": 0,
            "error": error,
        }
    }


def _engine_health(rate, healthy_above, degraded_above):
    """Classify ``rate`` as healthy/degraded/poor against two thresholds."""
    if rate > healthy_above:
        return "healthy"
    if rate > degraded_above:
        return "degraded"
    return "poor"


def get_rate_limiting_analytics(period="30d", username=None):
    """Summarize search-engine rate limiting for one user over a period.

    Args:
        period: One of ``"7d"``, ``"30d"``, ``"3m"`` or ``"1y"``; any other
            value applies no time filter.
        username: User whose database is queried. Falls back to the Flask
            session's ``"username"`` when falsy.

    Returns:
        ``{"rate_limiting": {...}}`` with overall attempt counts, success
        rate, average wait times, per-engine statistics and the number of
        healthy/degraded/poor engines. A zeroed payload with an ``error``
        key is returned when there is no user or when any exception is
        raised (which is logged).
    """
    try:
        if not username:
            username = flask_session.get("username")
        if not username:
            return _empty_rate_limiting_analytics("No user session")

        import time

        # Attempt timestamps are compared against epoch seconds.
        window_days = {"7d": 7, "30d": 30, "3m": 90, "1y": 365}.get(period)
        cutoff_time = (
            (time.time() - window_days * 24 * 3600) if window_days else 0
        )

        with get_user_db_session(username) as session:
            rate_limit_query = session.query(RateLimitAttempt)
            if cutoff_time > 0:
                rate_limit_query = rate_limit_query.filter(
                    RateLimitAttempt.timestamp >= cutoff_time
                )

            total_attempts = rate_limit_query.count()
            successful_attempts = rate_limit_query.filter(
                RateLimitAttempt.success
            ).count()
            failed_attempts = total_attempts - successful_attempts

            # Only failures recorded as RateLimitError count as events.
            rate_limit_events = rate_limit_query.filter(
                ~RateLimitAttempt.success,
                RateLimitAttempt.error_type == "RateLimitError",
            ).count()

            logger.info(
                f"Rate limit attempts in database: total={total_attempts}, successful={successful_attempts}"
            )

            attempts = rate_limit_query.all()

            if attempts:
                avg_wait_time = sum(a.wait_time for a in attempts) / len(
                    attempts
                )
                successful_wait_times = [
                    a.wait_time for a in attempts if a.success
                ]
                avg_successful_wait = (
                    sum(successful_wait_times) / len(successful_wait_times)
                    if successful_wait_times
                    else 0
                )
            else:
                avg_wait_time = 0
                avg_successful_wait = 0

            tracked_engines_query = session.query(
                func.count(func.distinct(RateLimitAttempt.engine_type))
            )
            if cutoff_time > 0:
                tracked_engines_query = tracked_engines_query.filter(
                    RateLimitAttempt.timestamp >= cutoff_time
                )
            tracked_engines = tracked_engines_query.scalar() or 0

            engine_types_query = session.query(
                RateLimitAttempt.engine_type
            ).distinct()
            if cutoff_time > 0:
                engine_types_query = engine_types_query.filter(
                    RateLimitAttempt.timestamp >= cutoff_time
                )
            engine_types = [row.engine_type for row in engine_types_query.all()]

            # Load the stored estimates for these engines in one query.
            estimates_by_engine = {}
            if engine_types:
                all_estimates = (
                    session.query(RateLimitEstimate)
                    .filter(RateLimitEstimate.engine_type.in_(engine_types))
                    .all()
                )
                estimates_by_engine = {e.engine_type: e for e in all_estimates}

            # Group attempts once instead of rescanning them per engine.
            attempts_by_engine = {}
            for attempt in attempts:
                attempts_by_engine.setdefault(attempt.engine_type, []).append(
                    attempt
                )

            engine_stats = []
            for engine_type in engine_types:
                engine_attempts_list = attempts_by_engine.get(engine_type, [])
                engine_attempts = len(engine_attempts_list)
                engine_success = sum(
                    1 for a in engine_attempts_list if a.success
                )

                estimate = estimates_by_engine.get(engine_type)

                recent_success_rate = (
                    (engine_success / engine_attempts * 100)
                    if engine_attempts > 0
                    else 0
                )

                # The stored estimate is a 0-1 fraction; the rate computed
                # from attempts is a percentage, hence the two threshold
                # pairs.
                if estimate:
                    status = _engine_health(estimate.success_rate, 0.8, 0.5)
                else:
                    status = _engine_health(recent_success_rate, 80, 50)

                engine_stat = {
                    "engine": engine_type,
                    "base_wait": estimate.base_wait_seconds
                    if estimate
                    else 0.0,
                    "base_wait_seconds": round(
                        estimate.base_wait_seconds if estimate else 0.0, 2
                    ),
                    "min_wait_seconds": round(
                        estimate.min_wait_seconds if estimate else 0.0, 2
                    ),
                    "max_wait_seconds": round(
                        estimate.max_wait_seconds if estimate else 0.0, 2
                    ),
                    "success_rate": round(estimate.success_rate * 100, 1)
                    if estimate
                    else recent_success_rate,
                    "total_attempts": estimate.total_attempts
                    if estimate
                    else engine_attempts,
                    "recent_attempts": engine_attempts,
                    "recent_success_rate": round(recent_success_rate, 1),
                    "attempts": engine_attempts,
                    "status": status,
                }

                if estimate:
                    # isoformat() on an aware datetime includes the offset.
                    engine_stat["last_updated"] = datetime.fromtimestamp(
                        estimate.last_updated, UTC
                    ).isoformat()
                else:
                    engine_stat["last_updated"] = "Never"

                engine_stats.append(engine_stat)

            logger.info(
                f"Tracked engines: {tracked_engines}, engine_stats: {engine_stats}"
            )

            statuses = [stat["status"] for stat in engine_stats]
            result = {
                "rate_limiting": {
                    "total_attempts": total_attempts,
                    "successful_attempts": successful_attempts,
                    "failed_attempts": failed_attempts,
                    "success_rate": (successful_attempts / total_attempts * 100)
                    if total_attempts > 0
                    else 0,
                    "rate_limit_events": rate_limit_events,
                    "avg_wait_time": round(float(avg_wait_time), 2),
                    "avg_successful_wait": round(float(avg_successful_wait), 2),
                    "tracked_engines": tracked_engines,
                    "engine_stats": engine_stats,
                    "total_engines_tracked": tracked_engines,
                    "healthy_engines": statuses.count("healthy"),
                    "degraded_engines": statuses.count("degraded"),
                    "poor_engines": statuses.count("poor"),
                }
            }

            # Full payload dump is diagnostic detail, so it stays at debug.
            logger.debug(f"Returning rate_limiting_analytics result: {result}")
            return result

    except Exception:
        logger.exception("Error getting rate limiting analytics")
        return _empty_rate_limiting_analytics(
            "An internal error occurred while processing the request."
        )
@metrics_bp.route("/")
@login_required
def metrics_dashboard():
    """Serve the metrics dashboard page for a logged-in user."""
    template_name = "pages/metrics.html"
    return render_template_with_defaults(template_name)
@metrics_bp.route("/context-overflow")
@login_required
def context_overflow_page():
    """Serve the context overflow analytics page for a logged-in user."""
    template_name = "pages/context_overflow.html"
    return render_template_with_defaults(template_name)
def _load_user_satisfaction(username, period):
    """Return the average rating and rating count for ``period``.

    Best effort: any exception is logged and a payload with a ``None``
    average and zero ratings is returned instead.
    """
    try:
        with get_user_db_session(username) as session:
            ratings_query = session.query(ResearchRating)
            time_condition = get_time_filter_condition(
                period, ResearchRating.created_at
            )
            if time_condition is not None:
                ratings_query = ratings_query.filter(time_condition)

            avg_rating = ratings_query.with_entities(
                func.avg(ResearchRating.rating).label("avg_rating")
            ).scalar()
            total_ratings = ratings_query.count()

            satisfaction = {
                "avg_rating": round(avg_rating, 1) if avg_rating else None,
                "total_ratings": total_ratings,
            }
    except Exception:
        logger.exception("Error getting user satisfaction data")
        satisfaction = {"avg_rating": None, "total_ratings": 0}
    return satisfaction


def _load_context_overflow(username, period, research_mode):
    """Return the truncation summary shown on the main dashboard.

    The failure sentinel is ``None`` rather than 0: a real zero means "no
    truncation", so reporting 0 after an error would hide a problem.
    """
    overflow = {
        "truncation_rate": None,
        "avg_tokens_truncated": None,
    }
    try:
        with get_user_db_session(username) as session:
            # Pass research_mode through so this panel follows the same
            # mode filter as the other metrics in the response.
            summary = get_context_overflow_truncation_summary(
                session, period, research_mode=research_mode
            )
            overflow = {
                "truncation_rate": round(summary["truncation_rate"], 1),
                "avg_tokens_truncated": int(summary["avg_tokens_truncated"]),
            }
    except Exception:
        logger.exception(
            "Error getting context overflow summary for /api/metrics"
        )
    return overflow


@metrics_bp.route("/api/metrics")
@login_required
def api_metrics():
    """Return the combined dashboard metrics as JSON.

    Reads ``period`` (default ``"30d"``) and ``mode`` (default ``"all"``)
    from the query string and merges token, search, strategy, rate-limiting
    and context-overflow metrics with the user satisfaction summary.
    Responds with status 500 and a generic message on unexpected failure.
    """
    logger.debug("api_metrics endpoint called")
    try:
        username = flask_session["username"]

        period = request.args.get("period", "30d")
        research_mode = request.args.get("mode", "all")

        token_counter = TokenCounter()
        search_tracker = get_search_tracker()

        token_metrics = token_counter.get_overall_metrics(
            period=period, research_mode=research_mode
        )
        search_metrics = search_tracker.get_search_metrics(
            period=period,
            research_mode=research_mode,
            username=username,
        )

        user_satisfaction = _load_user_satisfaction(username, period)

        strategy_data = get_strategy_analytics(period, username)
        logger.debug(f"strategy_data keys: {list(strategy_data.keys())}")

        rate_limiting_data = get_rate_limiting_analytics(period, username)
        logger.debug(f"rate_limiting_data: {rate_limiting_data}")
        logger.debug(
            f"rate_limiting_data keys: {list(rate_limiting_data.keys())}"
        )

        context_overflow_data = _load_context_overflow(
            username, period, research_mode
        )

        # Later sources win on key collisions, exactly as listed here.
        combined_metrics = {
            **token_metrics,
            **search_metrics,
            **strategy_data,
            **rate_limiting_data,
            **context_overflow_data,
            "user_satisfaction": user_satisfaction,
        }

        logger.debug(f"combined_metrics keys: {list(combined_metrics.keys())}")
        logger.debug(
            f"combined_metrics['rate_limiting']: {combined_metrics.get('rate_limiting', 'NOT FOUND')}"
        )

        response_body = {
            "status": "success",
            "metrics": combined_metrics,
            "period": period,
            "research_mode": research_mode,
        }
        return jsonify(response_body)
    except Exception:
        logger.exception("Error getting metrics")
        error_body = {
            "status": "error",
            "message": "An internal error occurred. Please try again later.",
        }
        return jsonify(error_body), 500
@metrics_bp.route("/api/rate-limiting")
@login_required
def api_rate_limiting_metrics():
    """Return detailed rate limiting metrics as JSON.

    Reads ``period`` (default ``"30d"``) from the query string and wraps the
    result of ``get_rate_limiting_analytics`` for the logged-in user.
    Responds with status 500 and a generic message on unexpected failure.
    """
    # Entry marker kept at debug level, matching the api_metrics endpoint.
    logger.debug("api_rate_limiting_metrics endpoint called")
    try:
        username = flask_session["username"]
        period = request.args.get("period", "30d")
        rate_limiting_data = get_rate_limiting_analytics(period, username)

        return jsonify(
            {"status": "success", "data": rate_limiting_data, "period": period}
        )
    except Exception:
        logger.exception("Error getting rate limiting metrics")
        return jsonify(
            {
                "status": "error",
                "message": "Failed to retrieve rate limiting metrics",
            }
        ), 500
@metrics_bp.route("/api/rate-limiting/current")
@login_required
def api_current_rate_limits():
    """Return the tracker's current rate limit estimate for every engine.

    Each row from ``get_tracker().get_stats()`` is unpacked into seven
    fields and reported with rounded wait times, the success rate as a
    percentage and a health status. Responds with status 500 and a generic
    message on unexpected failure.
    """
    try:
        current_limits = []
        for (
            engine_type,
            base_wait,
            min_wait,
            max_wait,
            last_updated,
            total_attempts,
            success_rate,
        ) in get_tracker().get_stats():
            # success_rate is a 0-1 fraction here.
            if success_rate > 0.8:
                status = "healthy"
            elif success_rate > 0.5:
                status = "degraded"
            else:
                status = "poor"

            # isoformat() on an aware datetime includes the UTC offset.
            updated_at = datetime.fromtimestamp(last_updated, UTC).isoformat()

            current_limits.append(
                {
                    "engine_type": engine_type,
                    "base_wait_seconds": round(base_wait, 2),
                    "min_wait_seconds": round(min_wait, 2),
                    "max_wait_seconds": round(max_wait, 2),
                    "success_rate": round(success_rate * 100, 1),
                    "total_attempts": total_attempts,
                    "last_updated": updated_at,
                    "status": status,
                }
            )

        response_body = {
            "status": "success",
            "current_limits": current_limits,
            "timestamp": datetime.now(UTC).isoformat(),
        }
        return jsonify(response_body)
    except Exception:
        logger.exception("Error getting current rate limits")
        error_body = {
            "status": "error",
            "message": "Failed to retrieve current rate limits",
        }
        return jsonify(error_body), 500
@metrics_bp.route("/api/metrics/research/<string:research_id>/links")
@login_required
def api_research_link_metrics(research_id):
    """Return link analytics for a single research as JSON.

    The response lists the total link count, the number of unique domains,
    the twenty most frequent domains with their share of all links, the
    category counts (under two keys) and the first ten resources. An empty
    data set is returned when the research has no resources; unexpected
    failures produce status 500 with a generic message.
    """
    try:
        username = flask_session["username"]

        with get_user_db_session(username) as session:
            resources = (
                session.query(ResearchResource)
                .filter(ResearchResource.research_id == research_id)
                .all()
            )

            if not resources:
                empty_data = {
                    "total_links": 0,
                    "unique_domains": 0,
                    "domains": [],
                    "category_distribution": {},
                    "domain_categories": {},
                    "resources": [],
                }
                return jsonify({"status": "success", "data": empty_data})

            total_links = len(resources)
            domain_counts: dict[str, Any] = {}
            category_counts: dict[str, Any] = {}

            # First pass: gather the distinct domains to classify.
            all_domains = set()
            for resource in resources:
                if not resource.url:
                    continue
                domain = _extract_domain(resource.url)
                if domain:
                    all_domains.add(domain)

            # Load every classification in one query rather than per domain.
            domain_classifications_map = {}
            if all_domains:
                rows = (
                    session.query(DomainClassification)
                    .filter(DomainClassification.domain.in_(all_domains))
                    .all()
                )
                domain_classifications_map = {row.domain: row for row in rows}

            # Second pass: count domains and their categories.
            for resource in resources:
                if not resource.url:
                    continue
                try:
                    domain = _extract_domain(resource.url)
                    if not domain:
                        continue

                    domain_counts[domain] = domain_counts.get(domain, 0) + 1

                    classification = domain_classifications_map.get(domain)
                    if classification:
                        label = classification.category
                    else:
                        label = "Unclassified"
                    category_counts[label] = category_counts.get(label, 0) + 1
                except (AttributeError, KeyError) as e:
                    logger.debug(f"Error classifying domain {domain}: {e}")

            ranked_domains = sorted(
                domain_counts.items(), key=lambda item: item[1], reverse=True
            )

            # Only the twenty most frequent domains are reported.
            domain_rows = [
                {
                    "domain": name,
                    "count": hits,
                    "percentage": round(hits / total_links * 100, 1),
                }
                for name, hits in ranked_domains[:20]
            ]

            # Only the first ten resources are reported, previews truncated.
            resource_rows = [
                {
                    "title": item.title or "Untitled",
                    "url": item.url,
                    "preview": item.content_preview[:200]
                    if item.content_preview
                    else None,
                }
                for item in resources[:10]
            ]

            return jsonify(
                {
                    "status": "success",
                    "data": {
                        "total_links": total_links,
                        "unique_domains": len(domain_counts),
                        "domains": domain_rows,
                        "category_distribution": category_counts,
                        # Same counts again under a second key.
                        "domain_categories": category_counts,
                        "resources": resource_rows,
                    },
                }
            )

    except Exception:
        logger.exception("Error getting research link metrics")
        return jsonify(
            {"status": "error", "message": "Failed to retrieve link metrics"}
        ), 500
@metrics_bp.route("/api/metrics/research/<string:research_id>")
@login_required
def api_research_metrics(research_id):
    """Return the token metrics recorded for one research run.

    Args:
        research_id: Research identifier taken from the URL path.

    Returns:
        A JSON response with ``status`` and ``metrics`` on success, or a
        generic error payload with HTTP 500 when anything raises.
    """
    try:
        research_metrics = TokenCounter().get_research_metrics(research_id)
        payload = {"status": "success", "metrics": research_metrics}
        return jsonify(payload)
    except Exception:
        # Log the full traceback server-side; the client only gets a
        # generic message.
        logger.exception("Error getting research metrics")
        failure = {
            "status": "error",
            "message": "An internal error occurred. Please try again later.",
        }
        return jsonify(failure), 500
@metrics_bp.route("/api/metrics/research/<string:research_id>/timeline")
@login_required
def api_research_timeline_metrics(research_id):
    """Return the timeline metrics recorded for one research run.

    Args:
        research_id: Research identifier taken from the URL path.

    Returns:
        A JSON response with ``status`` and ``metrics`` on success, or a
        generic error payload with HTTP 500 when anything raises.
    """
    try:
        counter = TokenCounter()
        payload = {
            "status": "success",
            "metrics": counter.get_research_timeline_metrics(research_id),
        }
        return jsonify(payload)
    except Exception:
        logger.exception("Error getting research timeline metrics")
        failure = {
            "status": "error",
            "message": "An internal error occurred. Please try again later.",
        }
        return jsonify(failure), 500
@metrics_bp.route("/api/metrics/research/<string:research_id>/search")
@login_required
def api_research_search_metrics(research_id):
    """Return the search metrics recorded for one research run.

    The lookup is scoped to the logged-in user by passing the session's
    username to the search tracker.

    Args:
        research_id: Research identifier taken from the URL path.

    Returns:
        A JSON response with ``status`` and ``metrics`` on success, or a
        generic error payload with HTTP 500 when anything raises.
    """
    try:
        # Direct key access is intentional: see the module-level note on
        # flask_session["username"].
        current_user = flask_session["username"]
        tracker = get_search_tracker()
        payload = {
            "status": "success",
            "metrics": tracker.get_research_search_metrics(
                research_id, username=current_user
            ),
        }
        return jsonify(payload)
    except Exception:
        logger.exception("Error getting research search metrics")
        failure = {
            "status": "error",
            "message": "An internal error occurred. Please try again later.",
        }
        return jsonify(failure), 500
@metrics_bp.route("/api/metrics/enhanced")
@login_required
def api_enhanced_metrics():
    """Return the enhanced (Phase 1) tracking metrics.

    Query params:
        period (str): time window, default ``"30d"``.
        mode (str): research mode filter, default ``"all"``.

    Returns:
        A JSON response containing the enhanced token metrics, extended with
        a ``search_time_series`` entry and the rating analytics, plus the
        echoed ``period`` and ``research_mode``. On failure a generic error
        payload is returned with HTTP 500.
    """
    try:
        args = request.args
        period = args.get("period", "30d")
        research_mode = args.get("mode", "all")
        username = flask_session["username"]

        token_counter = TokenCounter()
        search_tracker = get_search_tracker()

        metrics = token_counter.get_enhanced_metrics(
            period=period, research_mode=research_mode
        )

        # Time series that feeds the search activity chart.
        metrics["search_time_series"] = search_tracker.get_search_time_series(
            period=period,
            research_mode=research_mode,
            username=username,
        )

        # Merge the rating analytics keys into the same payload.
        metrics.update(get_rating_analytics(period, research_mode, username))

        response_body = {
            "status": "success",
            "metrics": metrics,
            "period": period,
            "research_mode": research_mode,
        }
        return jsonify(response_body)
    except Exception:
        logger.exception("Error getting enhanced metrics")
        failure = {
            "status": "error",
            "message": "An internal error occurred. Please try again later.",
        }
        return jsonify(failure), 500
@metrics_bp.route("/api/ratings/<string:research_id>", methods=["GET"])
@login_required
def api_get_research_rating(research_id):
    """Return the stored star rating for a research session, if any.

    Args:
        research_id: Research identifier taken from the URL path.

    Returns:
        A JSON response with ``rating`` set to the stored value together with
        its ``created_at`` / ``updated_at`` ISO timestamps, or ``rating: null``
        when no rating exists. A generic error payload with HTTP 500 is
        returned when anything raises.
    """
    try:
        username = flask_session["username"]

        with get_user_db_session(username) as session:
            stored = (
                session.query(ResearchRating)
                .filter_by(research_id=research_id)
                .first()
            )

            # No rating yet is a normal outcome, not an error.
            if not stored:
                return jsonify({"status": "success", "rating": None})

            found = {
                "status": "success",
                "rating": stored.rating,
                "created_at": stored.created_at.isoformat(),
                "updated_at": stored.updated_at.isoformat(),
            }
            return jsonify(found)

    except Exception:
        logger.exception("Error getting research rating")
        failure = {
            "status": "error",
            "message": "An internal error occurred. Please try again later.",
        }
        return jsonify(failure), 500
@metrics_bp.route("/api/ratings/<string:research_id>", methods=["POST"])
@login_required
@require_json_body(error_format="status")
def api_save_research_rating(research_id):
    """Create or update the star rating for a research session.

    The JSON body must be an object with an integer ``rating`` in the range
    1..5. Booleans are rejected explicitly: ``bool`` is a subclass of ``int``
    in Python, so JSON ``true`` would otherwise pass the integer check and be
    stored as a rating.

    Args:
        research_id: Research identifier taken from the URL path.

    Returns:
        A success JSON payload echoing the saved rating, HTTP 400 when the
        rating is missing or invalid, or a generic error payload with
        HTTP 500 when anything raises.
    """
    try:
        username = flask_session["username"]

        data = request.get_json()
        # A JSON array/string/number body has no ``get``; treat it as a
        # missing rating (400) instead of crashing into the 500 handler.
        rating_value = data.get("rating") if isinstance(data, dict) else None

        is_valid_rating = (
            isinstance(rating_value, int)
            and not isinstance(rating_value, bool)
            and 1 <= rating_value <= 5
        )
        if not is_valid_rating:
            return (
                jsonify(
                    {
                        "status": "error",
                        "message": "Rating must be an integer between 1 and 5",
                    }
                ),
                400,
            )

        with get_user_db_session(username) as session:
            # Check if rating already exists
            existing_rating = (
                session.query(ResearchRating)
                .filter_by(research_id=research_id)
                .first()
            )

            if existing_rating:
                # Update existing rating
                existing_rating.rating = rating_value
                existing_rating.updated_at = func.now()
            else:
                # Create new rating
                new_rating = ResearchRating(
                    research_id=research_id, rating=rating_value
                )
                session.add(new_rating)

            session.commit()

            return jsonify(
                {
                    "status": "success",
                    "message": "Rating saved successfully",
                    "rating": rating_value,
                }
            )

    except Exception:
        logger.exception("Error saving research rating")
        return (
            jsonify(
                {
                    "status": "error",
                    "message": "An internal error occurred. Please try again later.",
                }
            ),
            500,
        )
@metrics_bp.route("/star-reviews")
@login_required
def star_reviews():
    """Render the star reviews metrics page."""
    template_name = "pages/star_reviews.html"
    return render_template_with_defaults(template_name)
@metrics_bp.route("/costs")
@login_required
def cost_analytics():
    """Render the cost analytics page."""
    template_name = "pages/cost_analytics.html"
    return render_template_with_defaults(template_name)
@metrics_bp.route("/api/star-reviews")
@login_required
def api_star_reviews():
    """Return star-rating analytics for the logged-in user.

    Query params:
        period (str): time window handed to ``get_time_filter_condition``
            (default ``"30d"``). When that helper returns ``None`` no time
            filter is applied.

    Returns:
        A JSON object with ``overall_stats``, ``llm_ratings``,
        ``search_engine_ratings``, ``rating_trends`` and ``recent_ratings``
        (at most 20 rows, newest first), or ``{"error": ...}`` with HTTP 500
        when anything raises.
    """
    try:
        username = flask_session["username"]

        period = request.args.get("period", "30d")

        with get_user_db_session(username) as session:
            time_condition = get_time_filter_condition(
                period, ResearchRating.created_at
            )

            def within_period(query):
                """Apply the period filter when the period defines one."""
                if time_condition is not None:
                    return query.filter(time_condition)
                return query

            def star_count(stars):
                """SQL expression counting ratings equal to ``stars``."""
                return func.sum(
                    case((ResearchRating.rating == stars, 1), else_=0)
                )

            def satisfaction_rate(row):
                """Percentage of ratings >= 4, guarded against a zero count."""
                return round(
                    (row.positive_ratings or 0)
                    / max(row.rating_count or 1, 1)
                    * 100,
                    1,
                )

            # Overall rating statistics
            overall_stats = within_period(
                session.query(
                    func.avg(ResearchRating.rating).label("avg_rating"),
                    func.count(ResearchRating.rating).label("total_ratings"),
                    star_count(5).label("five_star"),
                    star_count(4).label("four_star"),
                    star_count(3).label("three_star"),
                    star_count(2).label("two_star"),
                    star_count(1).label("one_star"),
                )
            ).first()

            # Ratings by LLM model (taken from token_usage since Research
            # doesn't have a model field).
            # NOTE(review): the outer join yields one row per TokenUsage
            # record, so a rating whose research has several token-usage rows
            # is presumably counted once per row -- confirm whether the counts
            # are meant to be per rating.
            llm_ratings = (
                within_period(
                    session.query(
                        func.coalesce(TokenUsage.model_name, "Unknown").label(
                            "model"
                        ),
                        func.avg(ResearchRating.rating).label("avg_rating"),
                        func.count(ResearchRating.rating).label(
                            "rating_count"
                        ),
                        func.sum(
                            case((ResearchRating.rating >= 4, 1), else_=0)
                        ).label("positive_ratings"),
                    ).outerjoin(
                        TokenUsage,
                        ResearchRating.research_id == TokenUsage.research_id,
                    )
                )
                .group_by(TokenUsage.model_name)
                .order_by(func.avg(ResearchRating.rating).desc())
                .all()
            )

            # Ratings by search engine (join with token_usage to get the
            # search engine that was selected).
            search_engine_ratings = (
                within_period(
                    session.query(
                        func.coalesce(
                            TokenUsage.search_engine_selected, "Unknown"
                        ).label("search_engine"),
                        func.avg(ResearchRating.rating).label("avg_rating"),
                        func.count(ResearchRating.rating).label(
                            "rating_count"
                        ),
                        func.sum(
                            case((ResearchRating.rating >= 4, 1), else_=0)
                        ).label("positive_ratings"),
                    ).outerjoin(
                        TokenUsage,
                        ResearchRating.research_id == TokenUsage.research_id,
                    )
                )
                .group_by(TokenUsage.search_engine_selected)
                .having(func.count(ResearchRating.rating) > 0)
                .order_by(func.avg(ResearchRating.rating).desc())
                .all()
            )

            # Rating trends over time (one bucket per calendar date)
            rating_trends = (
                within_period(
                    session.query(
                        func.date(ResearchRating.created_at).label("date"),
                        func.avg(ResearchRating.rating).label("avg_rating"),
                        func.count(ResearchRating.rating).label("daily_count"),
                    )
                )
                .group_by(func.date(ResearchRating.created_at))
                .order_by("date")
                .all()
            )

            # Recent ratings with research details.
            # Research.created_at is labelled so the result row has only one
            # column named ``created_at`` (the rating's own timestamp), which
            # is the one serialized below.
            recent_ratings = (
                within_period(
                    session.query(
                        ResearchRating.rating,
                        ResearchRating.created_at,
                        ResearchRating.research_id,
                        Research.query,
                        Research.mode,
                        TokenUsage.model_name,
                        Research.created_at.label("research_created_at"),
                    )
                    .outerjoin(
                        Research, ResearchRating.research_id == Research.id
                    )
                    .outerjoin(
                        TokenUsage,
                        ResearchRating.research_id == TokenUsage.research_id,
                    )
                )
                .order_by(ResearchRating.created_at.desc())
                .limit(20)
                .all()
            )

            return jsonify(
                {
                    "overall_stats": {
                        "avg_rating": round(overall_stats.avg_rating or 0, 2),
                        "total_ratings": overall_stats.total_ratings or 0,
                        "rating_distribution": {
                            "5": overall_stats.five_star or 0,
                            "4": overall_stats.four_star or 0,
                            "3": overall_stats.three_star or 0,
                            "2": overall_stats.two_star or 0,
                            "1": overall_stats.one_star or 0,
                        },
                    },
                    "llm_ratings": [
                        {
                            "model": rating.model,
                            "avg_rating": round(rating.avg_rating or 0, 2),
                            "rating_count": rating.rating_count or 0,
                            "positive_ratings": rating.positive_ratings or 0,
                            "satisfaction_rate": satisfaction_rate(rating),
                        }
                        for rating in llm_ratings
                    ],
                    "search_engine_ratings": [
                        {
                            "search_engine": rating.search_engine,
                            "avg_rating": round(rating.avg_rating or 0, 2),
                            "rating_count": rating.rating_count or 0,
                            "positive_ratings": rating.positive_ratings or 0,
                            "satisfaction_rate": satisfaction_rate(rating),
                        }
                        for rating in search_engine_ratings
                    ],
                    "rating_trends": [
                        {
                            "date": str(trend.date),
                            "avg_rating": round(trend.avg_rating or 0, 2),
                            "count": trend.daily_count or 0,
                        }
                        for trend in rating_trends
                    ],
                    "recent_ratings": [
                        {
                            "rating": rating.rating,
                            "created_at": str(rating.created_at),
                            "research_id": rating.research_id,
                            # Fall back to placeholders when the joined
                            # Research / TokenUsage row is missing.
                            "query": (
                                rating.query
                                if rating.query
                                else f"Research Session #{rating.research_id}"
                            ),
                            "mode": rating.mode
                            if rating.mode
                            else "Standard Research",
                            "llm_model": (
                                rating.model_name
                                if rating.model_name
                                else "LLM Model"
                            ),
                        }
                        for rating in recent_ratings
                    ],
                }
            )

    except Exception:
        logger.exception("Error getting star reviews data")
        return (
            jsonify(
                {"error": "An internal error occurred. Please try again later."}
            ),
            500,
        )
@metrics_bp.route("/api/pricing")
@login_required
def api_pricing():
    """Return the statically configured LLM pricing table.

    Returns:
        A JSON response with the ``pricing`` data, a ``last_updated`` UTC
        timestamp generated at request time and an explanatory ``note``, or
        ``{"error": ...}`` with HTTP 500 when anything raises.
    """
    try:
        from ...metrics.pricing.pricing_fetcher import PricingFetcher

        # Only the static table is read; no asynchronous fetch is performed.
        static_table = PricingFetcher().static_pricing

        response_body = {
            "status": "success",
            "pricing": static_table,
            "last_updated": datetime.now(UTC).isoformat(),
            "note": "Pricing data is from static configuration. Real-time APIs not available for most providers.",
        }
        return jsonify(response_body)

    except Exception:
        logger.exception("Error fetching pricing data")
        return jsonify({"error": "Internal Server Error"}), 500
@metrics_bp.route("/api/pricing/<model_name>")
@login_required
def api_model_pricing(model_name):
    """Return pricing information for a single model.

    Args:
        model_name: Model identifier taken from the URL path.

    Query params:
        provider (str, optional): echoed back in the response; it is not
            used for the lookup itself.

    Returns:
        A JSON response with ``model``, ``provider``, ``pricing`` and a
        ``last_updated`` UTC timestamp, or ``{"error": ...}`` with HTTP 500
        when anything raises.
    """
    try:
        # Optional provider parameter
        provider = request.args.get("provider")

        from ...metrics.pricing.cost_calculator import CostCalculator

        calculator = CostCalculator()

        # Prefer the cached entry; otherwise run a synchronous calculation
        # and read back which pricing it used.
        pricing = calculator.cache.get_model_pricing(model_name)
        if not pricing:
            pricing = calculator.calculate_cost_sync(
                model_name, 1000, 1000
            ).get("pricing_used", {})

        response_body = {
            "status": "success",
            "model": model_name,
            "provider": provider,
            "pricing": pricing,
            "last_updated": datetime.now(UTC).isoformat(),
        }
        return jsonify(response_body)

    except Exception:
        logger.exception(f"Error getting pricing for model: {model_name}")
        return jsonify({"error": "An internal error occurred"}), 500
@metrics_bp.route("/api/cost-calculation", methods=["POST"])
@login_required
@require_json_body(error_message="No data provided")
def api_cost_calculation():
    """Calculate the cost of a given token usage.

    JSON body:
        model_name (str): required.
        provider (str, optional): echoed back in the response.
        prompt_tokens (number): defaults to 0.
        completion_tokens (number): defaults to 0.

    Returns:
        A JSON response echoing the inputs, ``total_tokens`` and the fields
        produced by ``calculate_cost_sync``. HTTP 400 is returned when
        ``model_name`` is missing or a token count is not a non-negative
        number; ``{"error": ...}`` with HTTP 500 when anything else raises.
    """
    try:
        data = request.get_json()
        model_name = data.get("model_name")
        provider = data.get("provider")  # Optional provider parameter
        prompt_tokens = data.get("prompt_tokens", 0)
        completion_tokens = data.get("completion_tokens", 0)

        if not model_name:
            return jsonify({"error": "model_name is required"}), 400

        # Reject strings, null and booleans up front: they would otherwise
        # either concatenate in ``total_tokens`` or fail inside the cost
        # calculation and surface as an opaque 500.
        for field_name, value in (
            ("prompt_tokens", prompt_tokens),
            ("completion_tokens", completion_tokens),
        ):
            if (
                isinstance(value, bool)
                or not isinstance(value, (int, float))
                or value < 0
            ):
                return (
                    jsonify(
                        {
                            "error": f"{field_name} must be a non-negative number"
                        }
                    ),
                    400,
                )

        from ...metrics.pricing.cost_calculator import CostCalculator

        # Use synchronous cost calculation
        calculator = CostCalculator()
        cost_data = calculator.calculate_cost_sync(
            model_name, prompt_tokens, completion_tokens
        )

        return jsonify(
            {
                "status": "success",
                "model_name": model_name,
                "provider": provider,
                "prompt_tokens": prompt_tokens,
                "completion_tokens": completion_tokens,
                "total_tokens": prompt_tokens + completion_tokens,
                **cost_data,
            }
        )

    except Exception:
        logger.exception("Error calculating cost")
        return jsonify({"error": "An internal error occurred"}), 500
@metrics_bp.route("/api/research-costs/<string:research_id>")
@login_required
def api_research_costs(research_id):
    """Return the total cost and token counts for one research session.

    Args:
        research_id: Research identifier taken from the URL path.

    Returns:
        A JSON response with ``total_cost`` (rounded to 6 decimals),
        ``total_tokens``, ``prompt_tokens`` and ``completion_tokens``. When
        the research has no token usage rows a zero-cost payload with an
        explanatory ``message`` is returned. ``{"error": ...}`` with HTTP 500
        is returned when anything raises.
    """
    try:
        username = flask_session["username"]

        with get_user_db_session(username) as session:
            # All token usage rows recorded for this research.
            rows = (
                session.query(TokenUsage)
                .filter(TokenUsage.research_id == research_id)
                .all()
            )

            if not rows:
                empty_body = {
                    "status": "success",
                    "research_id": research_id,
                    "total_cost": 0.0,
                    "message": "No token usage data found for this research session",
                }
                return jsonify(empty_body)

            # Plain-dict snapshot of each row; ``provider`` is read with
            # getattr so both old and new records are handled.
            usage_data = [
                {
                    "model_name": row.model_name,
                    "provider": getattr(row, "provider", None),
                    "prompt_tokens": row.prompt_tokens,
                    "completion_tokens": row.completion_tokens,
                    "timestamp": row.timestamp,
                }
                for row in rows
            ]

            from ...metrics.pricing.cost_calculator import CostCalculator

            # Synchronous cost calculation, one call per usage row.
            calculator = CostCalculator()
            costs = [
                {
                    **entry,
                    **calculator.calculate_cost_sync(
                        entry["model_name"],
                        entry["prompt_tokens"],
                        entry["completion_tokens"],
                    ),
                }
                for entry in usage_data
            ]

            total_cost = sum(item["total_cost"] for item in costs)
            prompt_total = sum(entry["prompt_tokens"] for entry in usage_data)
            completion_total = sum(
                entry["completion_tokens"] for entry in usage_data
            )

            return jsonify(
                {
                    "status": "success",
                    "research_id": research_id,
                    "total_cost": round(total_cost, 6),
                    "total_tokens": prompt_total + completion_total,
                    "prompt_tokens": prompt_total,
                    "completion_tokens": completion_total,
                }
            )

    except Exception:
        logger.exception(
            f"Error getting research costs for research: {research_id}"
        )
        return jsonify({"error": "An internal error occurred"}), 500
@metrics_bp.route("/api/cost-analytics")
@login_required
def api_cost_analytics():
    """Return cost analytics across the user's research sessions.

    Query params:
        period (str): time window handed to ``get_time_filter_condition``
            (default ``"30d"``).

    Returns:
        A JSON response with an ``overview`` (total cost and token counts),
        the ten most expensive research sessions and ``research_count``.
        When more than 1000 usage rows match, only the 1000 most recent are
        analysed. On failure the same shape is returned with zeroed values,
        an ``error`` message and HTTP 200 so the UI keeps rendering.
    """
    # Read outside the ``try`` so the fallback response in the ``except``
    # handler can always echo it, even when the failure happens early.
    period = request.args.get("period", "30d")

    try:
        username = flask_session["username"]

        with get_user_db_session(username) as session:
            # Get token usage for the period
            query = session.query(TokenUsage)
            time_condition = get_time_filter_condition(
                period, TokenUsage.timestamp
            )
            if time_condition is not None:
                query = query.filter(time_condition)

            # First check if we have any records to avoid expensive queries
            record_count = query.count()

            if record_count == 0:
                return jsonify(
                    {
                        "status": "success",
                        "period": period,
                        "overview": {
                            "total_cost": 0.0,
                            "total_tokens": 0,
                            "prompt_tokens": 0,
                            "completion_tokens": 0,
                        },
                        "top_expensive_research": [],
                        "research_count": 0,
                        "message": "No token usage data found for this period",
                    }
                )

            # If we have too many records, limit to recent ones to avoid timeout
            if record_count > 1000:
                logger.warning(
                    f"Large dataset detected ({record_count} records), limiting to recent 1000 for performance"
                )
                usage_records = (
                    query.order_by(TokenUsage.timestamp.desc())
                    .limit(1000)
                    .all()
                )
            else:
                usage_records = query.all()

            from ...metrics.pricing.cost_calculator import CostCalculator

            # Use synchronous calculation
            calculator = CostCalculator()

            # Single pass: price each record once and feed both the overall
            # totals and the per-research totals, instead of calling
            # calculate_cost_sync a second time for the same record.
            total_cost = 0
            total_prompt_tokens = 0
            total_completion_tokens = 0
            research_totals: dict[Any, float] = {}
            for record in usage_records:
                record_cost = calculator.calculate_cost_sync(
                    record.model_name,
                    record.prompt_tokens,
                    record.completion_tokens,
                )["total_cost"]
                total_cost += record_cost
                total_prompt_tokens += record.prompt_tokens
                total_completion_tokens += record.completion_tokens
                research_totals[record.research_id] = (
                    research_totals.get(record.research_id, 0) + record_cost
                )

            cost_summary = {
                "total_cost": round(total_cost, 6),
                "total_tokens": total_prompt_tokens + total_completion_tokens,
                "prompt_tokens": total_prompt_tokens,
                "completion_tokens": total_completion_tokens,
            }

            # Top expensive research sessions, ranked on the rounded totals
            # that are reported to the client.
            top_expensive = sorted(
                (
                    (rid, round(research_total, 6))
                    for rid, research_total in research_totals.items()
                ),
                key=lambda pair: pair[1],
                reverse=True,
            )[:10]

            return jsonify(
                {
                    "status": "success",
                    "period": period,
                    "overview": cost_summary,
                    "top_expensive_research": [
                        {"research_id": rid, "total_cost": cost}
                        for rid, cost in top_expensive
                    ],
                    "research_count": len(research_totals),
                }
            )

    except Exception:
        logger.exception("Error getting cost analytics")
        # Return a more graceful error response
        return (
            jsonify(
                {
                    "status": "success",
                    "period": period,
                    "overview": {
                        "total_cost": 0.0,
                        "total_tokens": 0,
                        "prompt_tokens": 0,
                        "completion_tokens": 0,
                    },
                    "top_expensive_research": [],
                    "research_count": 0,
                    "error": "Cost analytics temporarily unavailable",
                }
            ),
            200,
        )  # Return 200 to avoid breaking the UI
@metrics_bp.route("/links")
@login_required
def link_analytics():
    """Render the link analytics page."""
    template_name = "pages/link_analytics.html"
    return render_template_with_defaults(template_name)
@metrics_bp.route("/api/link-analytics")
@login_required
def api_link_analytics():
    """Return link analytics for the logged-in user.

    Query params:
        period (str): time window, default ``"30d"``.

    Returns:
        A JSON response whose ``data`` is the ``link_analytics`` entry
        produced by ``get_link_analytics``, plus the echoed ``period``. A
        generic error payload with HTTP 500 is returned when anything raises.
    """
    try:
        username = flask_session["username"]
        period = request.args.get("period", "30d")

        analytics = get_link_analytics(period, username)

        response_body = {
            "status": "success",
            "data": analytics["link_analytics"],
            "period": period,
        }
        return jsonify(response_body)

    except Exception:
        logger.exception("Error getting link analytics")
        failure = {
            "status": "error",
            "message": "An internal error occurred. Please try again later.",
        }
        return jsonify(failure), 500
@metrics_bp.route("/api/domain-classifications", methods=["GET"])
@login_required
def api_get_domain_classifications():
    """Return every stored domain classification for the logged-in user.

    Returns:
        A JSON response with the serialized ``classifications`` and their
        ``total`` count, or an error payload with HTTP 500 when anything
        raises. The classifier is closed in all cases once it was created.
    """
    classifier = None
    try:
        classifier = DomainClassifier(flask_session["username"])
        found = classifier.get_all_classifications()

        response_body = {
            "status": "success",
            "classifications": [item.to_dict() for item in found],
            "total": len(found),
        }
        return jsonify(response_body)

    except Exception:
        logger.exception("Error getting domain classifications")
        failure = {
            "status": "error",
            "message": "Failed to retrieve classifications",
        }
        return jsonify(failure), 500
    finally:
        # ``classifier`` stays None when construction itself failed.
        if classifier is not None:
            from ...utilities.resource_utils import safe_close

            safe_close(classifier, "domain classifier")
@metrics_bp.route("/api/domain-classifications/summary", methods=["GET"])
@login_required
def api_get_classifications_summary():
    """Return the per-category summary of domain classifications.

    Returns:
        A JSON response with the ``summary`` produced by the classifier, or
        an error payload with HTTP 500 when anything raises. The classifier
        is closed in all cases once it was created.
    """
    classifier = None
    try:
        classifier = DomainClassifier(flask_session["username"])
        response_body = {
            "status": "success",
            "summary": classifier.get_categories_summary(),
        }
        return jsonify(response_body)

    except Exception:
        logger.exception("Error getting classifications summary")
        failure = {"status": "error", "message": "Failed to retrieve summary"}
        return jsonify(failure), 500
    finally:
        # ``classifier`` stays None when construction itself failed.
        if classifier is not None:
            from ...utilities.resource_utils import safe_close

            safe_close(classifier, "domain classifier")
@metrics_bp.route("/api/domain-classifications/classify", methods=["POST"])
@login_required
def api_classify_domains():
    """Classify a single domain, or all domains in batch mode.

    JSON body (all keys optional):
        domain (str): domain to classify when ``batch`` is not set.
        force_update (bool): forwarded to the classifier, default ``False``.
        batch (bool): classify all domains, default ``False``.

    Returns:
        The serialized classification (single mode) or the batch ``results``.
        HTTP 400 is returned when a single domain could not be classified or
        when neither ``domain`` nor ``batch`` was supplied (including a
        missing, malformed or non-object JSON body). An error payload with
        HTTP 500 is returned when anything raises.
    """
    classifier = None
    try:
        username = flask_session["username"]

        # ``silent=True`` returns None for a missing/invalid JSON body rather
        # than raising, so such requests reach the 400 branch below instead
        # of being reported as a server error.
        data = request.get_json(silent=True)
        if not isinstance(data, dict):
            data = {}
        domain = data.get("domain")
        force_update = data.get("force_update", False)
        batch_mode = data.get("batch", False)

        # Get settings snapshot for LLM configuration
        from ...settings.manager import SettingsManager

        with get_user_db_session(username) as db_session:
            settings_manager = SettingsManager(db_session=db_session)
            settings_snapshot = settings_manager.get_all_settings()

        classifier = DomainClassifier(
            username, settings_snapshot=settings_snapshot
        )

        if domain and not batch_mode:
            # Classify single domain
            logger.info(f"Classifying single domain: {domain}")
            classification = classifier.classify_domain(domain, force_update)
            if classification:
                return jsonify(
                    {
                        "status": "success",
                        "classification": classification.to_dict(),
                    }
                )
            return jsonify(
                {
                    "status": "error",
                    "message": f"Failed to classify domain: {domain}",
                }
            ), 400
        if batch_mode:
            # Batch classification - this should really be a background task.
            # The call below runs synchronously within the request.
            logger.info("Starting batch classification of all domains")
            results = classifier.classify_all_domains(force_update)

            return jsonify({"status": "success", "results": results})
        return jsonify(
            {
                "status": "error",
                "message": "Must provide either 'domain' or set 'batch': true",
            }
        ), 400

    except Exception:
        logger.exception("Error classifying domains")
        return jsonify(
            {"status": "error", "message": "Failed to classify domains"}
        ), 500
    finally:
        # ``classifier`` stays None when the failure happened before it was
        # constructed.
        if classifier is not None:
            from ...utilities.resource_utils import safe_close

            safe_close(classifier, "domain classifier")
@metrics_bp.route("/api/domain-classifications/progress", methods=["GET"])
@login_required
def api_classification_progress():
    """Report how many of the user's resource domains are classified.

    Domains are derived from the distinct ``ResearchResource.url`` values via
    ``_extract_domain``. Only classifications whose domain is in that set are
    counted, so ``classified`` never exceeds ``total_domains``,
    ``unclassified`` is never negative and ``percentage`` stays in 0..100
    even when classification rows exist for domains no longer referenced by
    any resource.

    Returns:
        A JSON response with a ``progress`` object (``total_domains``,
        ``classified``, ``unclassified``, ``percentage`` and the sorted
        ``all_domains`` list), or an error payload with HTTP 500 when
        anything raises.
    """
    try:
        username = flask_session["username"]

        with get_user_db_session(username) as session:
            # Collect the unique, normalized domains of all stored resources.
            url_rows = session.query(ResearchResource.url).distinct().all()
            domains = set()

            for (url,) in url_rows:
                if url:
                    domain = _extract_domain(url)
                    if domain:
                        domains.add(domain)

            all_domains = sorted(domains)
            total_domains = len(domains)

            # Count only classifications that belong to a current domain.
            classified_rows = session.query(DomainClassification.domain).all()
            classified_domains = {name for (name,) in classified_rows}
            classified_count = len(domains & classified_domains)

            return jsonify(
                {
                    "status": "success",
                    "progress": {
                        "total_domains": total_domains,
                        "classified": classified_count,
                        "unclassified": total_domains - classified_count,
                        "percentage": round(
                            (classified_count / total_domains * 100)
                            if total_domains > 0
                            else 0,
                            1,
                        ),
                        "all_domains": all_domains,  # Return all domains for classification
                    },
                }
            )

    except Exception:
        logger.exception("Error getting classification progress")
        return jsonify(
            {"status": "error", "message": "Failed to retrieve progress"}
        ), 500
2182# ---------------------------------------------------------------------------
2183# Journal Quality Dashboard
2184# ---------------------------------------------------------------------------
@metrics_bp.route("/journals")
@login_required
def journal_quality():
    """Render the journal quality dashboard page."""
    template_name = "pages/journal_quality.html"
    return render_template_with_defaults(template_name)
@metrics_bp.route("/api/journal-data/status")
@login_required
def api_journal_data_status():
    """Return the status of the downloadable journal data files.

    Returns:
        The JSON-encoded result of ``get_journal_data_status()``, or
        ``{"error": ...}`` with HTTP 500 when anything raises.
    """
    try:
        from ...journal_quality.downloader import (
            get_journal_data_status,
        )

        status_info = get_journal_data_status()
        return jsonify(status_info)
    except Exception:
        logger.exception("Error checking journal data status")
        return jsonify({"error": "Failed to check status"}), 500
@metrics_bp.route("/api/journal-data/download", methods=["POST"])
@login_required
@journal_data_limit
def api_journal_data_download():
    """Start a download/update of the journal data files.

    Rate-limited to 2 per hour per authenticated user: the download streams
    several hundred MB and rebuilds the on-disk reference DB, so unbounded
    invocation is a DoS vector.

    An optional JSON body may carry ``force`` (default ``False``), which is
    forwarded to ``download_journal_data``.

    Returns:
        ``{"success": true, "message": ...}`` on success,
        ``{"success": false, "message": "Download failed"}`` when the
        downloader reports failure, and the same failure payload with
        HTTP 500 when anything raises.
    """
    try:
        from ...journal_quality.downloader import (
            download_journal_data,
            get_download_state,
        )
        from ...journal_quality.data_sources import ALL_SOURCES

        if request.is_json:
            force = request.json.get("force", False)
        else:
            force = False

        success, internal_message = download_journal_data(force=force)
        if not success:
            logger.warning(f"Journal data download failed: {internal_message}")
            return jsonify({"success": False, "message": "Download failed"})

        # download_journal_data() already calls build_db() + reset_db()
        # internally on its success path (downloader.py:563 → db.py:1209),
        # so the DB is live on disk and the cached engine has been
        # invalidated by the time we get here. Do not add a second build
        # here — it would run the full ~30 s rebuild a second time and
        # write to the legacy `journal_reference.db` filename that the
        # downloader just cleaned up.

        # Build the user-facing message locally from structured state
        # (ints + developer-authored source labels). We deliberately do
        # NOT echo `internal_message` from download_journal_data: keeping
        # the response safe-by-construction means a future refactor that
        # lets arbitrary strings (exception info, user input, PII) slip
        # into the downloader's message cannot reach the client.
        counts = get_download_state().get("counts")
        if counts is None:
            # `counts` is None when download_journal_data took its
            # early-return "already up to date" branch (no fetch ran).
            user_message = "Journal data is already up to date."
        else:
            parts = [
                f"{int(counts.get(src.key) or 0)} {src.count_label}"
                for src in ALL_SOURCES
            ]
            user_message = (
                f"Fetched {' + '.join(parts)}. Database rebuilt successfully."
            )
        return jsonify({"success": True, "message": user_message})
    except Exception:
        logger.exception("Error downloading journal data")
        return jsonify({"success": False, "message": "Download failed"}), 500
#: ``score_source`` filter values that ``/api/journals`` accepts.
#: Mirrors what the writer side stores: ``openalex`` / ``doaj`` for
#: reference-DB hits and ``llm`` for Tier 4 cache rows. The empty string
#: ("no filter") never reaches this set — the caller short-circuits it
#: before validating.
_ALLOWED_SCORE_SOURCES = frozenset(("openalex", "doaj", "llm"))

#: Largest ``page`` value ``/api/journals`` accepts. Requests above it are
#: rejected during input validation, so a crafted ``?page=10**9`` is turned
#: away before any paginated query is issued.
_MAX_PAGE = 10**4
@metrics_bp.route("/api/journals")
@login_required
@journals_read_limit
def api_journal_quality():
    """Serve one page of journal quality rows, filtered and sorted server-side.

    Rows come from the bundled read-only reference database (~217K
    journals) rather than the per-user DB, so the dashboard is always
    populated.

    Query params:
        page (int): 1-indexed page number (default 1, max 10000)
        per_page (int): rows per page, clamped to 1..200 (default 50)
        search (str): name substring filter
        tier (str): elite/strong/moderate/low/predatory
        score_source (str): openalex/doaj/llm (allowlisted)
        sort (str): column to sort by (default quality)
        order (str): asc or desc (default desc)
        include_summary (str): ``"true"`` to attach summary statistics

    Responds with 503 when the reference DB reports itself unavailable,
    400 for invalid ``page`` / ``per_page`` / ``score_source`` values, and
    500 on any unexpected error.
    """

    def _error(message, http_status):
        # Every failure path shares the same {"status", "message"} envelope.
        return jsonify({"status": "error", "message": message}), http_status

    try:
        from ...journal_quality.db import get_journal_reference_db

        reference_db = get_journal_reference_db()
        if not reference_db.available:
            return _error("Journal reference database not available.", 503)

        params = request.args

        # --- input validation -------------------------------------------
        try:
            requested_page = int(params.get("page", 1))
            page = max(1, requested_page)
            requested_size = int(params.get("per_page", 50))
            per_page = min(max(1, requested_size), 200)
        except (TypeError, ValueError):
            return _error("Invalid pagination parameters", 400)

        if page > _MAX_PAGE:
            return _error(
                f"page exceeds maximum ({_MAX_PAGE}); narrow the "
                "filter or increase per_page",
                400,
            )

        search = params.get("search", "")
        tier = params.get("tier", "")
        score_source = params.get("score_source", "")
        # Empty string means "no filter" and is deliberately not validated.
        if score_source and score_source not in _ALLOWED_SCORE_SOURCES:
            return _error(
                f"Invalid score_source; must be one of "
                f"{sorted(_ALLOWED_SCORE_SOURCES)}",
                400,
            )

        sort = params.get("sort", "quality")
        order = params.get("order", "desc")

        # --- query ------------------------------------------------------
        journals, total = reference_db.get_journals_page(
            page=page,
            per_page=per_page,
            search=search,
            tier=tier,
            score_source=score_source,
            sort=sort,
            order=order,
        )

        # Clamp the echoed page so the UI never shows an out-of-range
        # number on crafted input. The clamp is applied after the query:
        # only the value reported back to the client changes.
        if per_page > 0 and total > 0:
            total_pages = -(-total // per_page)  # ceiling division
        else:
            total_pages = 1
        page = min(page, total_pages)

        pagination = {
            "page": page,
            "per_page": per_page,
            "total_count": total,
            "total_pages": total_pages,
        }
        result = {
            "status": "success",
            "journals": journals,
            "pagination": pagination,
        }

        # The summary is opt-in so ordinary pagination/sort/filter
        # requests do not pay for the extra summary queries.
        wants_summary = params.get("include_summary", "false") == "true"
        if wants_summary:
            summary = reference_db.get_summary()
            summary["quality_distribution"] = (
                reference_db.get_quality_distribution()
            )
            summary["source_distribution"] = (
                reference_db.get_source_distribution()
            )
            result["summary"] = summary

        return jsonify(result)

    except Exception:
        logger.exception("Error getting journal quality data")
        return _error("An internal error occurred. Please try again later.", 500)
def _ref_db_lookup(ref_db, name: str) -> dict:
    """Fetch display bibliometrics for one journal from the reference DB.

    Args:
        ref_db: Object exposing ``lookup_source(name=...)``, or ``None``.
        name: Journal name to look up.

    Returns:
        ``{}`` when ``ref_db`` is ``None``, ``name`` is empty, or the lookup
        raises. Otherwise a dict with the keys ``h_index``,
        ``impact_factor``, ``sjr_quartile``, ``is_predatory``,
        ``predatory_source``, ``is_in_doaj``, ``has_doaj_seal`` and
        ``publisher``; absent values come back as ``None`` (or ``False``
        for the three boolean flags).
    """
    if not name or ref_db is None:
        return {}

    try:
        record = ref_db.lookup_source(name=name) or {}
    except Exception:  # noqa: silent-exception
        # Best-effort enrichment: a failed lookup simply means the row is
        # shown without bibliometric extras rather than failing the page.
        return {}

    # (output key, key in the lookup result, coerce to bool?)
    # The lookup result is compact: the SJR quartile is stored under
    # "quartile", and the predatory/DOAJ fields may be missing entirely.
    field_plan = (
        ("h_index", "h_index", False),
        ("impact_factor", "impact_factor", False),
        ("sjr_quartile", "quartile", False),
        ("is_predatory", "is_predatory", True),
        ("predatory_source", "predatory_source", False),
        ("is_in_doaj", "is_in_doaj", True),
        ("has_doaj_seal", "has_doaj_seal", True),
        ("publisher", "publisher", False),
    )
    bibliometrics = {}
    for out_key, source_key, as_flag in field_plan:
        raw_value = record.get(source_key)
        bibliometrics[out_key] = bool(raw_value) if as_flag else raw_value
    return bibliometrics
def _get_ref_db_or_none():
    """Return the journal reference DB handle, or ``None`` if obtaining it fails.

    The reference DB is optional: when importing or constructing it raises,
    callers fall back to rendering with user-DB data only.

    NOTE(review): only exceptions map to ``None`` here; the handle's
    ``available`` flag is not consulted — confirm callers tolerate a
    handle whose backing snapshot has not been downloaded.
    """
    try:
        from ...journal_quality.db import get_journal_reference_db

        handle = get_journal_reference_db()
    except Exception:  # noqa: silent-exception
        # Import or initialization failure is unusual, but must not take
        # the dashboard down with it.
        handle = None
    return handle
2445def _resolve_paper_quality(
2446 llm_quality: int | None, enrichment: dict
2447) -> tuple[int | None, str | None]:
2448 """Pick a quality score for a dashboard row.
2450 Precedence: current LLM verdict from the user's ``journals`` table
2451 (Tier 4 cache, keyed by NFKC-normalized container_title) → live
2452 derivation from the bundled reference DB row (Tier 1-3). Always
2453 live — no frozen per-Paper copy exists, so a re-scored journal
2454 propagates automatically. Returns (score, source_label) or
2455 (None, None) if neither path had data.
2456 """
2457 if llm_quality is not None:
2458 return llm_quality, "llm"
2459 if not enrichment:
2460 return None, None
2461 # enrichment comes from _source_to_dashboard_dict — row.quality is
2462 # the ref-DB's pre-computed score (same formula as the filter uses),
2463 # so we trust it directly rather than re-running derive_quality_score.
2464 q = enrichment.get("quality")
2465 if q is not None:
2466 return int(q), enrichment.get("score_source") or "openalex"
2467 return None, None
def _lookup_journal_llm_quality(
    db, container_titles: list[str]
) -> dict[str, int]:
    """Fetch current Tier 4 LLM scores for a batch of journal titles.

    Args:
        db: Session for the user's database (must support ``query``).
        container_titles: Raw container titles; falsy entries are ignored.

    Returns:
        Mapping of ``normalize_name(container_title)`` to ``Journal.quality``
        as ``int``. Journals with no stored quality are omitted, so callers
        fall through to the bundled reference DB. A single
        ``name_lower IN (...)`` query is issued.
    """
    from ...journal_quality.scoring import normalize_name

    if not container_titles:
        return {}

    # De-duplicate on the normalized key so the IN list carries each
    # journal at most once.
    lookup_keys = {
        normalize_name(title) for title in container_titles if title
    }
    if not lookup_keys:
        return {}

    scored = db.query(Journal.name_lower, Journal.quality).filter(
        Journal.name_lower.in_(list(lookup_keys)),
        Journal.quality.isnot(None),
    )
    verdicts: dict[str, int] = {}
    for name_key, quality in scored.all():
        verdicts[name_key] = int(quality)
    return verdicts
@metrics_bp.route("/api/journals/user-research")
@login_required
@journals_read_limit
def api_user_research_journals():
    """List the journals that appear across the user's research sessions.

    The query is rooted at ``Paper`` and grouped by
    ``Paper.container_title``, keeping the 200 titles with the most
    papers. Quality is resolved at request time: first from the user's
    ``journals`` table (Tier 4, keyed by normalized container title), then
    from the bundled read-only reference DB (Tier 1-3). No per-paper score
    is stored, so a re-scored journal is reflected immediately.

    Returns:
        JSON with ``status``, ``summary`` (``total_journals``,
        ``avg_quality``, ``total_papers``, ``predatory_blocked``),
        ``quality_distribution`` and ``journals``. 401 when the session
        has no username; 500 on any unexpected error.
    """
    username = flask_session.get("username")
    if not username:
        return jsonify({"status": "error", "message": "Not authenticated"}), 401

    # Same shape as a populated response, so the frontend needs no
    # special case for "nothing to show".
    nothing_to_show = {
        "status": "success",
        "summary": {
            "total_journals": 0,
            "avg_quality": None,
            "total_papers": 0,
            "predatory_blocked": 0,
        },
        "quality_distribution": {},
        "journals": [],
    }

    try:
        from sqlalchemy import inspect as sa_inspect

        with get_user_db_session(username) as user_db:
            if not sa_inspect(user_db.bind).has_table("papers"):
                return jsonify(nothing_to_show)

            # Top 200 journals by paper count. Papers with no remaining
            # PaperAppearance (their research session was deleted and the
            # appearances cascade-deleted) are excluded so the dashboard
            # reflects current data only. See issue #3544.
            top_rows = (
                user_db.query(
                    Paper.container_title,
                    func.count(Paper.id).label("paper_count"),
                    func.min(Paper.year).label("year_min"),
                    func.max(Paper.year).label("year_max"),
                )
                .filter(Paper.container_title.isnot(None))
                .filter(Paper.appearances.any())
                .group_by(Paper.container_title)
                .order_by(func.count(Paper.id).desc())
                .limit(200)
                .all()
            )
            if not top_rows:
                return jsonify(nothing_to_show)

            titles = [row.container_title for row in top_rows]

            # One batched reference-DB lookup for the whole slice instead
            # of one point query per title.
            ref_db = _get_ref_db_or_none()
            enrich_map = (
                ref_db.lookup_sources_batch(titles) if ref_db is not None else {}
            )

            from ...journal_quality.scoring import normalize_name

            # Current LLM verdicts (Tier 4), keyed by normalized name.
            llm_by_name = _lookup_journal_llm_quality(user_db, titles)

            journals: list[dict] = []
            qualities: list[int] = []
            hidden_enrichment_keys = ("quality", "score_source", "name")
            for row in top_rows:
                key = normalize_name(row.container_title)
                enrichment = enrich_map.get(key, {})
                quality, source_label = _resolve_paper_quality(
                    llm_by_name.get(key), enrichment
                )
                if quality is not None:
                    qualities.append(quality)

                entry = {
                    "name": row.container_title,
                    "quality": quality,
                    "score_source": source_label,
                    "paper_count": row.paper_count,
                    "year_min": row.year_min,
                    "year_max": row.year_max,
                }
                # Remaining reference-DB fields are passed through as-is.
                entry.update(
                    (field, value)
                    for field, value in enrichment.items()
                    if field not in hidden_enrichment_keys
                )
                journals.append(entry)

            # Summary figures cover the same top-200 slice that the table
            # renders.
            if qualities:
                avg_quality = round(sum(qualities) / len(qualities), 1)
            else:
                avg_quality = None

            quality_distribution: dict[str, int] = {}
            for score in qualities:
                bucket = str(score)
                quality_distribution[bucket] = (
                    quality_distribution.get(bucket, 0) + 1
                )

            # The predatory count covers every distinct container_title in
            # the user's research, not just the displayed slice.
            #
            # KNOWN-DEFERRED: unbounded SELECT DISTINCT. Accepted for now
            # because typical users have <5K distinct titles even after
            # years of use, and count_predatory_by_names documents support
            # up to ~100K params. A .limit(N) was considered and rejected:
            # it would SILENTLY UNDERCOUNT predatory journals, violating
            # the no-fallbacks rule. The proper fix (cross-DB correlated
            # subquery or TTL cache) is tracked as a follow-up; visible
            # impact is expected around ~50K papers.
            predatory_blocked = 0
            if ref_db is not None:
                # Same orphan exclusion as the top-200 query, so titles
                # whose only papers belonged to deleted sessions do not
                # inflate the count.
                distinct_title_rows = (
                    user_db.query(Paper.container_title)
                    .filter(Paper.container_title.isnot(None))
                    .filter(Paper.appearances.any())
                    .distinct()
                    .all()
                )
                all_names = [title for (title,) in distinct_title_rows]
                predatory_blocked = ref_db.count_predatory_by_names(all_names)

            summary = {
                "total_journals": len(journals),
                "avg_quality": avg_quality,
                "total_papers": sum(row.paper_count for row in top_rows),
                "predatory_blocked": predatory_blocked,
            }
            return jsonify(
                {
                    "status": "success",
                    "summary": summary,
                    "quality_distribution": quality_distribution,
                    "journals": journals,
                }
            )
    except Exception:
        logger.exception("Error getting user research journals")
        failure_body = {
            "status": "error",
            "message": "Failed to load your research data.",
        }
        return jsonify(failure_body), 500
@metrics_bp.route("/api/journals/research/<research_id>")
@login_required
@journals_read_limit
def api_research_journals(research_id):
    """Get journals encountered in a single research session.

    Filters the per-user papers table by joining through
    Paper → PaperAppearance → ResearchResource and matching ``research_id``.
    Quality is resolved live (journals.quality + bundled reference DB)
    so results always reflect the current verdict, not a stale snapshot.
    Mirrors the response shape of /api/journals/user-research so the
    dashboard can reuse its rendering code.

    Args:
        research_id: Identifier of the research session, taken from the URL.

    Returns:
        JSON with ``status``, ``summary`` (``total_journals``,
        ``avg_quality``, ``total_papers``, ``predatory_blocked``),
        ``quality_distribution`` and ``journals``. 401 when the session
        has no username, 404 when ``research_id`` is not present in the
        user's database, 500 on any unexpected error.
    """
    username = flask_session.get("username")
    if not username:
        return jsonify({"status": "error", "message": "Not authenticated"}), 401

    _empty_response = {
        "status": "success",
        "summary": {
            "total_journals": 0,
            "avg_quality": None,
            "total_papers": 0,
            "predatory_blocked": 0,
        },
        "quality_distribution": {},
        "journals": [],
    }

    try:
        from sqlalchemy import inspect as sa_inspect

        with get_user_db_session(username) as db:
            inspector = sa_inspect(db.bind)
            if not inspector.has_table("papers") or not inspector.has_table(
                "paper_appearances"
            ):
                return jsonify(_empty_response)

            # Verify the research_id belongs to this user before exposing
            # any data — research_history is in the same per-user DB so
            # the existence check doubles as an ownership check.
            from ...database.models.research import ResearchHistory

            research = (
                db.query(ResearchHistory.id)
                .filter(ResearchHistory.id == research_id)
                .first()
            )
            if research is None:
                return (
                    jsonify(
                        {"status": "error", "message": "Research not found"}
                    ),
                    404,
                )

            # Aggregate container_title → paper_count for this research.
            # Join chain: Paper → PaperAppearance → ResearchResource.
            #
            # The join yields one row per appearance, so a paper that
            # appears more than once in this research would be counted
            # repeatedly by a plain COUNT. COUNT(DISTINCT papers.id) keeps
            # ``paper_count`` / ``total_papers`` a count of papers.
            rows = (
                db.query(
                    Paper.container_title,
                    func.count(Paper.id.distinct()).label("paper_count"),
                    func.min(Paper.year).label("year_min"),
                    func.max(Paper.year).label("year_max"),
                )
                .join(
                    PaperAppearance,
                    PaperAppearance.paper_id == Paper.id,
                )
                .join(
                    ResearchResource,
                    ResearchResource.id == PaperAppearance.resource_id,
                )
                .filter(
                    ResearchResource.research_id == research_id,
                    Paper.container_title.isnot(None),
                )
                .group_by(Paper.container_title)
                .order_by(func.count(Paper.id.distinct()).desc())
                .all()
            )

            if not rows:
                return jsonify(_empty_response)

            titles = [r.container_title for r in rows]

            # One batched reference-DB lookup for every title in this
            # research; skipped when the reference DB is not obtainable.
            ref_db = _get_ref_db_or_none()
            enrich_map = {}
            if ref_db is not None:
                enrich_map = ref_db.lookup_sources_batch(titles)

            from ...journal_quality.scoring import normalize_name

            # Batch-look up current LLM verdicts (Tier 4) from the user's
            # journals table, keyed by normalized name, so the score shown
            # is always the current one.
            llm_by_name = _lookup_journal_llm_quality(db, titles)

            journals: list[dict] = []
            qualities: list[int] = []
            predatory_blocked = 0
            for r in rows:
                normalized = normalize_name(r.container_title)
                enrichment = enrich_map.get(normalized, {})
                if enrichment.get("is_predatory"):
                    predatory_blocked += 1
                quality, source_label = _resolve_paper_quality(
                    llm_by_name.get(normalized), enrichment
                )
                if quality is not None:
                    qualities.append(quality)
                journals.append(
                    {
                        "name": r.container_title,
                        "quality": quality,
                        "score_source": source_label,
                        "paper_count": r.paper_count,
                        "year_min": r.year_min,
                        "year_max": r.year_max,
                        # Pass through the remaining reference-DB fields;
                        # the three excluded keys are already set above.
                        **{
                            k: v
                            for k, v in enrichment.items()
                            if k not in ("quality", "score_source", "name")
                        },
                    }
                )

            total_papers = sum(r.paper_count for r in rows)
            avg_quality = (
                round(sum(qualities) / len(qualities), 1) if qualities else None
            )
            # Histogram of resolved scores, keyed by the score as a string.
            quality_distribution: dict[str, int] = {}
            for q in qualities:
                quality_distribution[str(q)] = (
                    quality_distribution.get(str(q), 0) + 1
                )

            return jsonify(
                {
                    "status": "success",
                    "summary": {
                        "total_journals": len(journals),
                        "avg_quality": avg_quality,
                        "total_papers": total_papers,
                        "predatory_blocked": predatory_blocked,
                    },
                    "quality_distribution": quality_distribution,
                    "journals": journals,
                }
            )
    except Exception:
        logger.exception("Error getting per-research journals")
        return (
            jsonify(
                {
                    "status": "error",
                    "message": "Failed to load research journals.",
                }
            ),
            500,
        )