Coverage for src / local_deep_research / research_library / routes / library_routes.py: 42%
555 statements
« prev ^ index » next coverage.py v7.13.4, created at 2026-02-25 01:07 +0000
« prev ^ index » next coverage.py v7.13.4, created at 2026-02-25 01:07 +0000
1"""
2Routes for Research Library and Download Manager
4Provides web endpoints for:
5- Library browsing and management
6- Download manager interface
7- API endpoints for downloads and queries
8"""
10import json
11from io import BytesIO
12from pathlib import Path
13from urllib.parse import urlparse
14from flask import (
15 Blueprint,
16 g,
17 jsonify,
18 request,
19 session,
20 Response,
21 send_file,
22 stream_with_context,
23)
24from loguru import logger
26from ...web.auth.decorators import login_required
27from ...web.utils.templates import render_template_with_defaults
28from ...database.session_context import get_user_db_session
29from ...database.models.research import ResearchResource
30from ...database.models.library import (
31 Document as Document,
32 DocumentStatus,
33 DownloadQueue as LibraryDownloadQueue,
34 Collection,
35)
36from ...library.download_management import ResourceFilter
37from ..services.download_service import DownloadService
38from ..services.library_service import LibraryService
39from ..services.pdf_storage_manager import PDFStorageManager
40from ..utils import handle_api_error
41from ...utilities.db_utils import get_settings_manager
42from ...config.paths import get_library_directory
# Create Blueprint
# All routes below are mounted under the /library URL prefix.
library_bp = Blueprint("library", __name__, url_prefix="/library")
# Error handler for authentication errors
@library_bp.errorhandler(Exception)
def handle_web_api_exception(error):
    """Handle WebAPIException and its subclasses.

    Converts known API exceptions into JSON responses; anything else is
    re-raised so Flask's default error handling applies.
    """
    from ...web.exceptions import WebAPIException

    if not isinstance(error, WebAPIException):
        # Not one of ours -- propagate to Flask's default handling.
        raise error
    return jsonify(error.to_dict()), error.status_code
def is_downloadable_domain(url: str) -> bool:
    """Check if URL is from a downloadable academic domain using proper URL parsing.

    Returns True for direct PDF links, URLs on a known academic publisher
    domain (or any subdomain of one), PubMed-looking hosts/paths, and URLs
    whose path or query hints at a PDF. Returns False for empty input or
    when parsing fails.
    """
    try:
        if not url:
            return False

        lowered = url.lower()
        parsed = urlparse(lowered)
        host = parsed.hostname or ""
        path = parsed.path or ""
        query = parsed.query or ""

        # Direct PDF files are always considered downloadable.
        if path.endswith(".pdf") or ".pdf?" in lowered:
            return True

        # Known academic publisher / preprint domains.
        academic_domains = (
            "arxiv.org",
            "biorxiv.org",
            "medrxiv.org",
            "ncbi.nlm.nih.gov",
            "pubmed.ncbi.nlm.nih.gov",
            "europepmc.org",
            "semanticscholar.org",
            "researchgate.net",
            "academia.edu",
            "sciencedirect.com",
            "springer.com",
            "nature.com",
            "wiley.com",
            "ieee.org",
            "acm.org",
            "plos.org",
            "frontiersin.org",
            "mdpi.com",
            "acs.org",
            "rsc.org",
            "tandfonline.com",
            "sagepub.com",
            "oxford.com",
            "cambridge.org",
            "bmj.com",
            "nejm.org",
            "thelancet.com",
            "jamanetwork.com",
            "annals.org",
            "ahajournals.org",
            "cell.com",
            "science.org",
            "pnas.org",
            "elifesciences.org",
            "embopress.org",
            "journals.asm.org",
            "microbiologyresearch.org",
            "jvi.asm.org",
            "genome.cshlp.org",
            "genetics.org",
            "g3journal.org",
            "plantphysiol.org",
            "plantcell.org",
            "aspb.org",
            "bioone.org",
            "company-of-biologists.org",
            "biologists.org",
            "jeb.biologists.org",
            "dmm.biologists.org",
            "bio.biologists.org",
            "doi.org",
            "ssrn.com",
            "openreview.net",
        )

        # Exact hostname match, or a subdomain of a known domain.
        if any(
            host == domain or host.endswith("." + domain)
            for domain in academic_domains
        ):
            return True

        # Special case for PubMed, which might appear in host or path.
        if "pubmed" in host or "/pubmed/" in path:
            return True

        # PDF hints in the path or the query string.
        return "/pdf/" in path or "type=pdf" in query or "format=pdf" in query

    except Exception as e:
        logger.warning(f"Error parsing URL {url}: {e}")
        return False
def get_authenticated_user_password(
    username: str, flask_session_id: str = None
) -> str:
    """
    Get authenticated user password from session store with fallback to g.user_password.

    Args:
        username: The username to get password for
        flask_session_id: Optional Flask session ID. If not provided, uses session.get("session_id")

    Returns:
        str: The user's password

    Raises:
        AuthenticationRequiredError: If no password is available for the user
    """
    from ...database.session_passwords import session_password_store
    from ...web.exceptions import AuthenticationRequiredError

    sid = flask_session_id or session.get("session_id")

    # Primary source: the per-session password store.
    stored = None
    try:
        stored = session_password_store.get_session_password(username, sid)
    except Exception:
        logger.exception("Failed to get user password from session store")

    if stored:
        logger.debug(
            f"Retrieved user password from session store for user {username}"
        )
        return stored

    # Secondary source: g.user_password (set by middleware if temp_auth was used).
    fallback = getattr(g, "user_password", None)
    if fallback:
        logger.debug(
            f"Retrieved user password from g.user_password fallback for user {username}"
        )
        return fallback

    # Nothing available -- caller must re-authenticate.
    logger.error(f"No user password available for user {username}")
    raise AuthenticationRequiredError(
        message="Authentication required: Please refresh the page and log in again to access encrypted database features.",
        username=username,
    )
202# ============= Page Routes =============
@library_bp.route("/")
@login_required
def library_page():
    """Main library page showing downloaded documents.

    Reads optional query-string filters (domain, research, collection),
    gathers stats and dropdown data, and renders the library template.
    """
    username = session.get("username")
    service = LibraryService(username)

    # Get library settings.
    # FIX: use the module-level get_settings_manager import (the original
    # re-imported it locally, duplicating the top-of-file import).
    settings = get_settings_manager()
    pdf_storage_mode = settings.get_setting(
        "research_library.pdf_storage_mode", "database"
    )
    # Enable PDF storage button if mode is not "none"
    enable_pdf_storage = pdf_storage_mode != "none"
    shared_library = settings.get_setting(
        "research_library.shared_library", False
    )

    # Get statistics
    stats = service.get_library_stats()

    # Get documents with optional filters
    domain_filter = request.args.get("domain")
    research_filter = request.args.get("research")
    collection_filter = request.args.get("collection")  # New collection filter

    documents = service.get_documents(
        research_id=research_filter,
        domain=domain_filter,
        collection_id=collection_filter,
        limit=100,
    )

    # Dropdown data for the filter controls.
    unique_domains = service.get_unique_domains()
    research_list = service.get_research_list_with_stats()
    collections = service.get_all_collections()

    return render_template_with_defaults(
        "pages/library.html",
        stats=stats,
        documents=documents,
        unique_domains=unique_domains,
        research_list=research_list,
        collections=collections,
        selected_collection=collection_filter,
        storage_path=stats.get("storage_path", ""),
        enable_pdf_storage=enable_pdf_storage,
        pdf_storage_mode=pdf_storage_mode,
        shared_library=shared_library,
    )
@library_bp.route("/document/<string:document_id>")
@login_required
def document_details_page(document_id):
    """Document details page showing all metadata and links."""
    service = LibraryService(session.get("username"))

    # Look up the document; 404 if it does not exist.
    document = service.get_document_by_id(document_id)
    if document is None:
        return "Document not found", 404

    return render_template_with_defaults(
        "pages/document_details.html", document=document
    )
@library_bp.route("/download-manager")
@login_required
def download_manager_page():
    """Download manager page for selecting and downloading research PDFs.

    Builds summary counts across all research sessions, enriches each
    session with a PDF-source preview and per-domain breakdown, and
    renders the download manager template.
    """
    username = session.get("username")
    service = LibraryService(username)

    # Get library settings.
    # FIX: use the module-level get_settings_manager import (the original
    # re-imported it locally, duplicating the top-of-file import).
    settings = get_settings_manager()
    pdf_storage_mode = settings.get_setting(
        "research_library.pdf_storage_mode", "database"
    )
    # Enable PDF storage button if mode is not "none"
    enable_pdf_storage = pdf_storage_mode != "none"
    shared_library = settings.get_setting(
        "research_library.shared_library", False
    )

    # Get research sessions with statistics
    research_list = service.get_research_list_with_stats()

    # Calculate summary statistics
    total_researches = len(research_list)
    total_resources = sum(r["total_resources"] for r in research_list)
    already_downloaded = sum(r["downloaded_count"] for r in research_list)
    available_to_download = (
        sum(r["downloadable_count"] for r in research_list) - already_downloaded
    )

    # Enrich research data with domain breakdowns
    for research in research_list:
        # Get PDF sources for this research
        documents = service.get_documents(
            research_id=research["id"], file_type="pdf"
        )
        research["pdf_sources"] = documents[:10]  # Preview first 10

        # Per-domain statistics: totals, PDF counts, completed downloads.
        domains = {}
        for doc in documents:
            domain = doc.get("domain", "unknown")
            if domain not in domains:
                domains[domain] = {"total": 0, "pdfs": 0, "downloaded": 0}
            domains[domain]["total"] += 1
            if doc["file_type"] == "pdf":
                domains[domain]["pdfs"] += 1
                if doc["download_status"] == "completed":
                    domains[domain]["downloaded"] += 1

        research["domains"] = domains

    return render_template_with_defaults(
        "pages/download_manager.html",
        research_list=research_list,
        total_researches=total_researches,
        total_resources=total_resources,
        already_downloaded=already_downloaded,
        available_to_download=available_to_download,
        enable_pdf_storage=enable_pdf_storage,
        pdf_storage_mode=pdf_storage_mode,
        shared_library=shared_library,
    )
348# ============= API Routes =============
@library_bp.route("/api/stats")
@login_required
def get_library_stats():
    """Get library statistics."""
    service = LibraryService(session.get("username"))
    return jsonify(service.get_library_stats())
@library_bp.route("/api/collections/list")
@login_required
def get_collections_list():
    """Get list of all collections for dropdown selection."""
    username = session.get("username")

    with get_user_db_session(username) as db_session:
        # Collections sorted alphabetically for a stable dropdown order.
        rows = db_session.query(Collection).order_by(Collection.name).all()

        payload = [
            {
                "id": row.id,
                "name": row.name,
                "description": row.description,
            }
            for row in rows
        ]

        return jsonify({"success": True, "collections": payload})
@library_bp.route("/api/documents")
@login_required
def get_documents():
    """Get documents with filtering.

    Query parameters: research_id, domain, file_type, favorites ("true"),
    search, limit (default 100), offset (default 0).
    """
    username = session.get("username")
    service = LibraryService(username)

    # Get filter parameters
    research_id = request.args.get("research_id")
    domain = request.args.get("domain")
    file_type = request.args.get("file_type")
    favorites_only = request.args.get("favorites") == "true"
    search_query = request.args.get("search")
    # ROBUSTNESS FIX: type=int makes malformed values fall back to the
    # default instead of raising ValueError (an HTTP 500) as bare int() did.
    limit = request.args.get("limit", 100, type=int)
    offset = request.args.get("offset", 0, type=int)

    documents = service.get_documents(
        research_id=research_id,
        domain=domain,
        file_type=file_type,
        favorites_only=favorites_only,
        search_query=search_query,
        limit=limit,
        offset=offset,
    )

    return jsonify({"documents": documents})
@library_bp.route(
    "/api/document/<string:document_id>/favorite", methods=["POST"]
)
@login_required
def toggle_favorite(document_id):
    """Toggle favorite status of a document."""
    service = LibraryService(session.get("username"))
    # Returns the new favorite state after toggling.
    return jsonify({"favorite": service.toggle_favorite(document_id)})
@library_bp.route("/api/document/<string:document_id>", methods=["DELETE"])
@login_required
def delete_document(document_id):
    """Delete a document from library."""
    service = LibraryService(session.get("username"))
    return jsonify({"success": service.delete_document(document_id)})
@library_bp.route("/api/document/<string:document_id>/pdf-url")
@login_required
def get_pdf_url(document_id):
    """Get URL for viewing PDF."""
    # Point the client at the PDF-serving endpoint for this document.
    payload = {
        "url": f"/library/api/document/{document_id}/pdf",
        "title": "Document",  # Could fetch actual title
    }
    return jsonify(payload)
@library_bp.route("/document/<string:document_id>/pdf")
@login_required
def view_pdf_page(document_id):
    """Page for viewing PDF file - uses PDFStorageManager for retrieval."""
    username = session.get("username")

    with get_user_db_session(username) as db_session:
        # Look up the document record.
        doc = db_session.query(Document).filter_by(id=document_id).first()
        if doc is None:
            logger.warning(
                f"Document ID {document_id} not found in database for user {username}"
            )
            return "Document not found", 404

        logger.info(
            f"Document {document_id}: title='{doc.title}', "
            f"file_path={doc.file_path}"
        )

        # Resolve storage configuration for the PDF manager.
        settings = get_settings_manager(db_session)
        storage_mode = settings.get_setting(
            "research_library.pdf_storage_mode", "none"
        )
        library_root = Path(
            settings.get_setting(
                "research_library.storage_path",
                str(get_library_directory()),
            )
        ).expanduser()

        # PDFStorageManager handles both database and filesystem storage.
        manager = PDFStorageManager(library_root, storage_mode)
        pdf_bytes = manager.load_pdf(doc, db_session)

        if not pdf_bytes:
            # No PDF found anywhere
            logger.warning(f"No PDF available for document {document_id}")
            return "PDF not available", 404

        logger.info(
            f"Serving PDF for document {document_id} ({len(pdf_bytes)} bytes)"
        )
        return send_file(
            BytesIO(pdf_bytes),
            mimetype="application/pdf",
            as_attachment=False,
            download_name=doc.filename or "document.pdf",
        )
@library_bp.route("/api/document/<string:document_id>/pdf")
@login_required
def serve_pdf_api(document_id):
    """API endpoint for serving PDF file (kept for backward compatibility).

    Thin delegation to view_pdf_page, which performs the actual lookup
    and PDF retrieval via PDFStorageManager.
    """
    return view_pdf_page(document_id)
@library_bp.route("/document/<string:document_id>/txt")
@login_required
def view_text_page(document_id):
    """Page for viewing text content."""
    username = session.get("username")

    with get_user_db_session(username) as db_session:
        # Text is stored directly on the Document row (text_content column).
        doc = db_session.query(Document).filter_by(id=document_id).first()

        if doc is None:
            logger.warning(f"Document not found for document ID {document_id}")
            return "Document not found", 404

        if not doc.text_content:
            logger.warning(f"Document {document_id} has no text content")
            return "Text content not available", 404

        logger.info(
            f"Serving text content for document {document_id}: {len(doc.text_content)} characters"
        )

        # Render as HTML page
        return render_template_with_defaults(
            "pages/document_text.html",
            document_id=document_id,
            title=doc.title or "Document Text",
            text_content=doc.text_content,
            extraction_method=doc.extraction_method,
            word_count=doc.word_count,
        )
@library_bp.route("/api/document/<string:document_id>/text")
@login_required
def serve_text_api(document_id):
    """API endpoint for serving text content (kept for backward compatibility)."""
    username = session.get("username")

    with get_user_db_session(username) as db_session:
        # Text is stored directly on the Document row (text_content column).
        doc = db_session.query(Document).filter_by(id=document_id).first()

        if doc is None:
            logger.warning(f"Document not found for document ID {document_id}")
            return jsonify({"error": "Document not found"}), 404

        if not doc.text_content:
            logger.warning(f"Document {document_id} has no text content")
            return jsonify({"error": "Text content not available"}), 404

        logger.info(
            f"Serving text content for document {document_id}: {len(doc.text_content)} characters"
        )

        payload = {
            "text_content": doc.text_content,
            "title": doc.title or "Document",
            "extraction_method": doc.extraction_method,
            "word_count": doc.word_count,
        }
        return jsonify(payload)
@library_bp.route("/api/open-folder", methods=["POST"])
@login_required
def open_folder():
    """Open folder containing a document.

    Security: This endpoint is disabled for server deployments.
    It only makes sense for desktop usage where the server and client are on the same machine.
    """
    payload = {
        "status": "error",
        "message": "This feature is disabled. It is only available in desktop mode.",
    }
    return jsonify(payload), 403
@library_bp.route("/api/download/<int:resource_id>", methods=["POST"])
@login_required
def download_single_resource(resource_id):
    """Download a single resource."""
    username = session.get("username")
    user_password = get_authenticated_user_password(username)

    with DownloadService(username, user_password) as service:
        ok, error = service.download_resource(resource_id)

    if ok:
        return jsonify({"success": True})

    logger.warning(f"Download failed for resource {resource_id}: {error}")
    # Generic message: do not leak internal failure details to the client.
    return jsonify(
        {
            "success": False,
            "error": "Download failed. Please try again or contact support.",
        }
    ), 500
@library_bp.route("/api/download-text/<int:resource_id>", methods=["POST"])
@login_required
def download_text_single(resource_id):
    """Download a single resource as text file."""
    try:
        username = session.get("username")
        user_password = get_authenticated_user_password(username)

        with DownloadService(username, user_password) as service:
            ok, error = service.download_as_text(resource_id)

        if not ok:
            # Sanitize error message - don't expose internal details
            if error:
                logger.warning(
                    f"Download as text failed for resource {resource_id}: {error}"
                )
            return jsonify(
                {"success": False, "error": "Failed to download resource"}
            )

        return jsonify({"success": True, "error": None})
    except Exception as e:
        return handle_api_error(
            f"downloading resource {resource_id} as text", e
        )
@library_bp.route("/api/download-all-text", methods=["POST"])
@login_required
def download_all_text():
    """Download all undownloaded resources as text files.

    Streams progress to the client as Server-Sent Events: one event per
    processed resource, then a final event with {"complete": true}.
    """
    username = session.get("username")
    # Capture Flask session ID to avoid scoping issues in nested function
    flask_session_id = session.get("session_id")

    def generate():
        # Get user password for database operations
        from ...web.exceptions import AuthenticationRequiredError

        try:
            user_password = get_authenticated_user_password(
                username, flask_session_id
            )
        except AuthenticationRequiredError:
            logger.warning(f"Authentication expired for user {username}")
            return

        download_service = DownloadService(username, user_password)
        try:
            # FIX: bound as `db_session` (not `session`) so the module-level
            # Flask `session` proxy is not shadowed inside this generator.
            with get_user_db_session(username) as db_session:
                # Get all resources; those with existing text files are
                # filtered out below.
                resources = db_session.query(ResearchResource).all()

                # Pre-scan the txt directory once so the existence check is
                # an O(1) set lookup per resource instead of a disk hit.
                txt_path = Path(download_service.library_root) / "txt"
                existing_resource_ids = set()
                if txt_path.exists():
                    for txt_file in txt_path.glob("*.txt"):
                        # Extract resource ID from filename pattern *_{id}.txt
                        parts = txt_file.stem.rsplit("_", 1)
                        if len(parts) == 2:
                            try:
                                existing_resource_ids.add(int(parts[1]))
                            except ValueError:
                                pass

                resources_to_process = [
                    resource
                    for resource in resources
                    if resource.id not in existing_resource_ids
                ]

                total = len(resources_to_process)
                current = 0
                logger.info(f"Found {total} resources needing text extraction")

                for resource in resources_to_process:
                    current += 1
                    progress = (
                        int((current / total) * 100) if total > 0 else 100
                    )
                    file_name = (
                        resource.title[:50]
                        if resource
                        else f"document_{current}.txt"
                    )

                    try:
                        success, error = download_service.download_as_text(
                            resource.id
                        )
                        if success:
                            status = "success"
                            error_msg = None
                        else:
                            status = "failed"
                            error_msg = error or "Text extraction failed"
                    except Exception as e:
                        logger.exception(
                            f"Error extracting text for resource {resource.id}"
                        )
                        status = "failed"
                        error_msg = (
                            f"Text extraction failed - {type(e).__name__}"
                        )

                    # Per-resource progress event for the UI.
                    update = {
                        "progress": progress,
                        "current": current,
                        "total": total,
                        "file": file_name,
                        "url": resource.url,  # Add the URL for UI display
                        "status": status,
                        "error": error_msg,
                    }
                    yield f"data: {json.dumps(update)}\n\n"

                # Send completion
                yield f"data: {json.dumps({'complete': True, 'total': total})}\n\n"
        finally:
            download_service.close()

    return Response(
        stream_with_context(generate()), mimetype="text/event-stream"
    )
@library_bp.route("/api/download-research/<research_id>", methods=["POST"])
@login_required
def download_research_pdfs(research_id):
    """Queue all PDFs from a research session for download."""
    username = session.get("username")
    user_password = get_authenticated_user_password(username)

    with DownloadService(username, user_password) as service:
        # Optional collection_id may be supplied in the JSON body.
        body = request.json or {}
        collection_id = body.get("collection_id")

        queued = service.queue_research_downloads(research_id, collection_id)

        # Start processing queue (in production, this would be a background task)
        # For now, we'll process synchronously
        # TODO: Integrate with existing queue processor

        return jsonify({"success": True, "queued": queued})
@library_bp.route("/api/download-bulk", methods=["POST"])
@login_required
def download_bulk():
    """Download PDFs or extract text from multiple research sessions.

    Request JSON:
        research_ids: list of research session IDs to process (required)
        mode: "pdf" (download PDFs, default) or "text_only" (extract text)
        collection_id: optional target collection for the downloads

    Returns a text/event-stream response with one progress event per
    queued item and a final completion event.
    """
    username = session.get("username")
    data = request.json
    research_ids = data.get("research_ids", [])
    mode = data.get("mode", "pdf")  # pdf or text_only
    collection_id = data.get(
        "collection_id"
    )  # Optional: target collection for downloads

    if not research_ids:
        return jsonify({"error": "No research IDs provided"}), 400

    # Capture Flask session ID to avoid scoping issues in nested function
    flask_session_id = session.get("session_id")

    def generate():
        """Generate progress updates as Server-Sent Events."""
        # Get user password for database operations
        from ...web.exceptions import AuthenticationRequiredError

        try:
            user_password = get_authenticated_user_password(
                username, flask_session_id
            )
        except AuthenticationRequiredError:
            return

        download_service = DownloadService(username, user_password)
        try:
            # Count total pending queue items across all research IDs
            total = 0
            current = 0

            # FIX: bound as `db_session` (not `session`) so the Flask
            # `session` proxy is not shadowed inside this generator.
            with get_user_db_session(username) as db_session:
                for research_id in research_ids:
                    count = (
                        db_session.query(LibraryDownloadQueue)
                        .filter_by(
                            research_id=research_id,
                            status=DocumentStatus.PENDING,
                        )
                        .count()
                    )
                    total += count
                    logger.debug(
                        f"[PROGRESS_DEBUG] Research {research_id}: {count} pending items in queue"
                    )

            logger.info(
                f"[PROGRESS_DEBUG] Total pending downloads across all research: {total}"
            )
            yield f"data: {json.dumps({'progress': 0, 'current': 0, 'total': total})}\n\n"

            # Process each research
            for research_id in research_ids:
                with get_user_db_session(username) as db_session:
                    # Get pending queue items for this research
                    queue_items = (
                        db_session.query(LibraryDownloadQueue)
                        .filter_by(
                            research_id=research_id,
                            status=DocumentStatus.PENDING,
                        )
                        .all()
                    )

                    # If no items queued yet, queue them now
                    if not queue_items:
                        try:
                            download_service.queue_research_downloads(
                                research_id, collection_id
                            )
                            # Re-fetch queue items.
                            # BUGFIX: use DocumentStatus.PENDING here; the
                            # original filtered on the raw string "pending",
                            # inconsistent with every other queue query.
                            queue_items = (
                                db_session.query(LibraryDownloadQueue)
                                .filter_by(
                                    research_id=research_id,
                                    status=DocumentStatus.PENDING,
                                )
                                .all()
                            )
                        except Exception:
                            logger.exception(
                                f"Error queueing downloads for research {research_id}"
                            )
                            # Continue with empty queue_items
                            queue_items = []

                    # Process each queued item
                    for queue_item in queue_items:
                        current += 1

                        # Defensive: avoid a crash if total was never set.
                        if total is None:
                            logger.error(
                                "[PROGRESS_DEBUG] ERROR: total is None! Setting to 0 to avoid crash"
                            )
                            total = 0

                        progress = (
                            int((current / total) * 100) if total > 0 else 100
                        )

                        # Get resource info
                        resource = db_session.query(ResearchResource).get(
                            queue_item.resource_id
                        )
                        file_name = (
                            resource.title[:50]
                            if resource
                            else f"document_{current}.pdf"
                        )

                        # Attempt actual download with error handling
                        skip_reason = None
                        status = "skipped"  # Default to skipped
                        success = False
                        error_msg = None

                        try:
                            logger.debug(
                                f"Attempting {'PDF download' if mode == 'pdf' else 'text extraction'} for resource {queue_item.resource_id}"
                            )

                            # Call appropriate service method based on mode
                            if mode == "pdf":
                                result = download_service.download_resource(
                                    queue_item.resource_id
                                )
                            else:  # text_only
                                result = download_service.download_as_text(
                                    queue_item.resource_id
                                )

                            # Handle new tuple return format
                            if isinstance(result, tuple):
                                success, skip_reason = result
                            else:
                                success = result
                                skip_reason = None

                            status = "success" if success else "skipped"
                            if skip_reason and not success:
                                error_msg = skip_reason
                                logger.info(
                                    f"{'Download' if mode == 'pdf' else 'Text extraction'} skipped for resource {queue_item.resource_id}: {skip_reason}"
                                )
                        except Exception as e:
                            # Log error but continue processing
                            error_msg = str(e)
                            error_type = type(e).__name__
                            logger.info(
                                f"CAUGHT Download exception for resource {queue_item.resource_id}: {error_type}: {error_msg}"
                            )
                            # Classify the failure so the UI shows a
                            # user-friendly category, not the raw error.
                            lowered = error_msg.lower()
                            access_phrases = (
                                "paywall",
                                "subscription",
                                "not available",
                                "not found",
                                "no free",
                                "embargoed",
                                "forbidden",
                                "not accessible",
                            )
                            download_phrases = (
                                "failed to download",
                                "could not",
                                "invalid",
                                "server",
                            )
                            if any(p in lowered for p in access_phrases):
                                status = "skipped"
                                skip_reason = f"Document not accessible (paywall or access restriction) - {error_type}"
                            elif any(p in lowered for p in download_phrases):
                                status = "failed"
                                skip_reason = f"Download failed - {error_type}"
                            else:
                                status = "failed"
                                skip_reason = (
                                    f"Processing failed - {error_type}"
                                )
                            success = False

                            # Ensure skip_reason is set if we have an error message
                            if error_msg and not skip_reason:
                                skip_reason = (
                                    f"Processing failed - {error_type}"
                                )

                        # Send progress update
                        update_data = {
                            "progress": progress,
                            "current": current,
                            "total": total,
                            "file": file_name,
                            "status": status,
                        }
                        # Add skip reason if available
                        if skip_reason:
                            update_data["error"] = skip_reason
                            logger.info(
                                f"Sending skip reason to UI: {skip_reason}"
                            )

                        yield f"data: {json.dumps(update_data)}\n\n"

            yield f"data: {json.dumps({'progress': 100, 'current': total, 'total': total, 'complete': True})}\n\n"
        finally:
            download_service.close()

    return Response(
        stream_with_context(generate()),
        mimetype="text/event-stream",
        headers={"Cache-Control": "no-cache", "X-Accel-Buffering": "no"},
    )
@library_bp.route("/api/research-list")
@login_required
def get_research_list():
    """Get list of research sessions with download stats."""
    service = LibraryService(session.get("username"))
    return jsonify({"research": service.get_research_list_with_stats()})
@library_bp.route("/api/sync-library", methods=["POST"])
@login_required
def sync_library():
    """Sync library database with filesystem."""
    service = LibraryService(session.get("username"))
    return jsonify(service.sync_library_with_filesystem())
@library_bp.route("/api/mark-redownload", methods=["POST"])
@login_required
def mark_for_redownload():
    """Mark documents for re-download."""
    username = session.get("username")
    service = LibraryService(username)

    payload = request.json
    document_ids = payload.get("document_ids", [])

    # Reject empty requests before touching the service.
    if not document_ids:
        return jsonify({"error": "No document IDs provided"}), 400

    marked = service.mark_for_redownload(document_ids)
    return jsonify({"success": True, "marked": marked})
@library_bp.route("/api/queue-all-undownloaded", methods=["POST"])
@login_required
def queue_all_undownloaded():
    """Queue all articles that haven't been downloaded yet.

    Finds every ResearchResource without a completed Document, runs them
    through the retry-policy filter (ResourceFilter), and enqueues the
    survivors as PENDING entries in the download queue. Existing queue
    rows are reset to PENDING rather than duplicated.

    Returns:
        JSON with queued/skipped counts, the affected research ids, and a
        detailed filter summary for the UI.
    """
    username = session.get("username")

    logger.info(f"queue_all_undownloaded called for user {username}")

    with get_user_db_session(username) as db_session:
        # Find all resources that don't have a completed download.
        # LEFT OUTER JOIN on (resource_id AND status == "completed"), then
        # keep rows where no such Document matched (Document.id IS NULL).
        undownloaded = (
            db_session.query(ResearchResource)
            .outerjoin(
                Document,
                (ResearchResource.id == Document.resource_id)
                & (Document.status == "completed"),
            )
            .filter(Document.id.is_(None))
            .all()
        )

        logger.info(f"Found {len(undownloaded)} total undownloaded resources")

        # Get user password for encrypted database access
        user_password = get_authenticated_user_password(username)

        # Retry-policy filter: decides which resources are worth retrying
        # (e.g. excludes permanently failed ones).
        resource_filter = ResourceFilter(username, user_password)
        filter_results = resource_filter.filter_downloadable_resources(
            undownloaded
        )

        # Get detailed filtering summary (returned to the client for display)
        filter_summary = resource_filter.get_filter_summary(undownloaded)
        skipped_info = resource_filter.get_skipped_resources_info(undownloaded)

        logger.info(f"Filter results: {filter_summary.to_dict()}")

        queued_count = 0
        research_ids = set()
        skipped_count = 0

        # Convert filter_results to dict for O(1) lookup instead of O(n²)
        filter_results_by_id = {r.resource_id: r for r in filter_results}

        for resource in undownloaded:
            # Check if resource passed the smart filter
            filter_result = filter_results_by_id.get(resource.id)

            if not filter_result or not filter_result.can_retry:
                skipped_count += 1
                if filter_result:
                    logger.debug(
                        f"Skipping resource {resource.id} due to retry policy: {filter_result.reason}"
                    )
                else:
                    logger.debug(
                        f"Skipping resource {resource.id} - no filter result available"
                    )
                continue

            # Check if it's downloadable using proper URL parsing
            if not resource.url:
                skipped_count += 1
                continue

            is_downloadable = is_downloadable_domain(resource.url)

            # Log what we're checking
            # NOTE(review): resource.url is guaranteed truthy here (guarded
            # above), so the extra url checks below are redundant but harmless.
            if resource.url and "pubmed" in resource.url.lower():
                logger.info(f"Found PubMed URL: {resource.url[:100]}")

            if not is_downloadable:
                skipped_count += 1
                logger.debug(
                    f"Skipping non-downloadable URL: {resource.url[:100] if resource.url else 'None'}"
                )
                continue

            # Check if already in queue (any status)
            existing_queue = (
                db_session.query(LibraryDownloadQueue)
                .filter_by(resource_id=resource.id)
                .first()
            )

            if existing_queue:
                # If it exists but isn't pending, reset it to pending
                if existing_queue.status != DocumentStatus.PENDING:
                    existing_queue.status = DocumentStatus.PENDING
                    existing_queue.completed_at = None
                    queued_count += 1
                    research_ids.add(resource.research_id)
                    logger.debug(
                        f"Reset queue entry for resource {resource.id} to pending"
                    )
                else:
                    # Already pending, still count it
                    queued_count += 1
                    research_ids.add(resource.research_id)
                    logger.debug(
                        f"Resource {resource.id} already pending in queue"
                    )
            else:
                # Add new entry to queue
                queue_entry = LibraryDownloadQueue(
                    resource_id=resource.id,
                    research_id=resource.research_id,
                    priority=0,
                    status=DocumentStatus.PENDING,
                )
                db_session.add(queue_entry)
                queued_count += 1
                research_ids.add(resource.research_id)
                logger.debug(
                    f"Added new queue entry for resource {resource.id}"
                )

        # Persist all queue inserts/resets in one commit.
        db_session.commit()

        logger.info(
            f"Queued {queued_count} articles for download, skipped {skipped_count} resources (including {filter_summary.permanently_failed_count} permanently failed and {filter_summary.temporarily_failed_count} temporarily failed)"
        )

        # Note: Removed synchronous download processing here to avoid blocking the HTTP request
        # Downloads will be processed via the SSE streaming endpoint or background tasks

        return jsonify(
            {
                "success": True,
                "queued": queued_count,
                "research_ids": list(research_ids),
                "total_undownloaded": len(undownloaded),
                "skipped": skipped_count,
                "filter_summary": filter_summary.to_dict(),
                "skipped_details": skipped_info,
            }
        )
@library_bp.route("/api/get-research-sources/<research_id>", methods=["GET"])
@login_required
def get_research_sources(research_id):
    """Get all sources for a research with snippets.

    Args:
        research_id: Identifier of the research whose resources to list.

    Returns:
        JSON with ``sources`` (ordered by creation time, numbered from 1)
        and ``total``. Each source carries its snippet, domain, and — when
        a completed Document exists — download metadata.
    """
    username = session.get("username")

    sources = []
    with get_user_db_session(username) as db_session:
        # Get all resources for this research
        resources = (
            db_session.query(ResearchResource)
            .filter_by(research_id=research_id)
            .order_by(ResearchResource.created_at)
            .all()
        )

        for idx, resource in enumerate(resources, 1):
            # Check if a downloaded document exists for this resource
            document = (
                db_session.query(Document)
                .filter_by(resource_id=resource.id)
                .first()
            )

            # Get domain from URL. urlparse is imported at module level;
            # the redundant per-iteration import has been removed.
            domain = ""
            if resource.url:
                try:
                    domain = urlparse(resource.url).hostname or ""
                except (ValueError, AttributeError):
                    # urlparse can raise ValueError for malformed URLs
                    pass

            source_data = {
                "number": idx,
                "resource_id": resource.id,
                "url": resource.url,
                "title": resource.title or f"Source {idx}",
                "snippet": resource.content_preview or "",
                "domain": domain,
                "relevance_score": getattr(resource, "relevance_score", None),
                "downloaded": False,
                "document_id": None,
                "file_type": None,
            }

            # Overlay download details when a completed document exists
            if document and document.status == "completed":
                source_data.update(
                    {
                        "downloaded": True,
                        "document_id": document.id,
                        "file_type": document.file_type,
                        "download_date": document.created_at.isoformat()
                        if document.created_at
                        else None,
                    }
                )

            sources.append(source_data)

    return jsonify({"success": True, "sources": sources, "total": len(sources)})
@library_bp.route("/api/check-downloads", methods=["POST"])
@login_required
def check_downloads():
    """Report per-URL download status for a given research run.

    Expects JSON with ``research_id`` and a ``urls`` list; responds with a
    ``download_status`` mapping keyed by URL.
    """
    current_user = session.get("username")
    payload = request.json
    research_id = payload.get("research_id")
    url_list = payload.get("urls", [])

    # Both fields are required.
    if not research_id or not url_list:
        return jsonify({"error": "Missing research_id or urls"}), 400

    statuses = {}

    with get_user_db_session(current_user) as db_session:
        # Only consider resources of this research whose URL was asked about.
        matching_resources = (
            db_session.query(ResearchResource)
            .filter_by(research_id=research_id)
            .filter(ResearchResource.url.in_(url_list))
            .all()
        )

        for res in matching_resources:
            # Look up a document row for this resource, if any.
            doc = (
                db_session.query(Document)
                .filter_by(resource_id=res.id)
                .first()
            )

            if doc and doc.status == "completed":
                statuses[res.url] = {
                    "downloaded": True,
                    "document_id": doc.id,
                    "file_path": doc.file_path,
                    "file_type": doc.file_type,
                    "title": doc.title or res.title,
                }
            else:
                statuses[res.url] = {
                    "downloaded": False,
                    "resource_id": res.id,
                }

    return jsonify({"download_status": statuses})
@library_bp.route("/api/download-source", methods=["POST"])
@login_required
def download_source():
    """Download a single source from a research.

    Expects JSON with ``research_id`` and ``url``. Validates the URL
    domain, upserts a high-priority queue entry, then attempts the
    download synchronously via DownloadService.

    Returns:
        JSON ``{"success": ..., "message": ...}`` (plus ``document_id``
        when already downloaded), or an error payload with HTTP 400/404.
    """
    username = session.get("username")
    user_password = get_authenticated_user_password(username)
    data = request.json
    research_id = data.get("research_id")
    url = data.get("url")

    if not research_id or not url:
        return jsonify({"error": "Missing research_id or url"}), 400

    # Check if URL is downloadable
    if not is_downloadable_domain(url):
        return jsonify({"error": "URL is not from a downloadable domain"}), 400

    with get_user_db_session(username) as db_session:
        # Find the resource
        resource = (
            db_session.query(ResearchResource)
            .filter_by(research_id=research_id, url=url)
            .first()
        )

        if not resource:
            return jsonify({"error": "Resource not found"}), 404

        # Check if already downloaded
        existing = (
            db_session.query(Document)
            .filter_by(resource_id=resource.id)
            .first()
        )

        # Bug fix: the Document model's field is ``status`` (compared to
        # "completed" everywhere else in this file); the previous
        # ``existing.download_status`` raised AttributeError whenever a
        # document row existed for the resource.
        if existing and existing.status == "completed":
            return jsonify(
                {
                    "success": True,
                    "message": "Already downloaded",
                    "document_id": existing.id,
                }
            )

        # Add to download queue, reusing any existing entry for the resource
        queue_entry = (
            db_session.query(LibraryDownloadQueue)
            .filter_by(resource_id=resource.id)
            .first()
        )

        if not queue_entry:
            queue_entry = LibraryDownloadQueue(
                resource_id=resource.id,
                research_id=resource.research_id,
                priority=1,  # Higher priority for manual downloads
                status=DocumentStatus.PENDING,
            )
            db_session.add(queue_entry)
        else:
            # Reset any previous state so the download is retried
            queue_entry.status = DocumentStatus.PENDING
            queue_entry.priority = 1

        db_session.commit()

        # Start download immediately (inside the session scope so
        # ``resource`` stays attached)
        with DownloadService(username, user_password) as service:
            success, message = service.download_resource(resource.id)

        if success:
            return jsonify(
                {"success": True, "message": "Download completed"}
            )
        else:
            # Log internal message, but show only generic message to user
            return jsonify({"success": False, "message": "Download failed"})