Coverage for src / local_deep_research / research_library / routes / library_routes.py: 14%
562 statements
« prev ^ index » next coverage.py v7.12.0, created at 2026-01-11 00:51 +0000
« prev ^ index » next coverage.py v7.12.0, created at 2026-01-11 00:51 +0000
1"""
2Routes for Research Library and Download Manager
4Provides web endpoints for:
5- Library browsing and management
6- Download manager interface
7- API endpoints for downloads and queries
8"""
10import json
11from io import BytesIO
12from pathlib import Path
13from urllib.parse import urlparse
14from flask import (
15 Blueprint,
16 g,
17 jsonify,
18 request,
19 session,
20 Response,
21 send_file,
22 stream_with_context,
23)
24from loguru import logger
26from ...web.auth.decorators import login_required
27from ...web.utils.templates import render_template_with_defaults
28from ...database.session_context import get_user_db_session
29from ...database.models.research import ResearchResource
30from ...database.models.library import (
31 Document as Document,
32 DocumentStatus,
33 DownloadQueue as LibraryDownloadQueue,
34 Collection,
35)
36from ...library.download_management import ResourceFilter
37from ..services.download_service import DownloadService
38from ..services.library_service import LibraryService
39from ..services.pdf_storage_manager import PDFStorageManager
40from ..utils import open_file_location, handle_api_error
41from ...security.path_validator import PathValidator
42from ...utilities.db_utils import get_settings_manager
43from ...config.paths import get_library_directory
# Create Blueprint for all Research Library pages and API endpoints;
# every route below is mounted under the /library URL prefix.
library_bp = Blueprint("library", __name__, url_prefix="/library")
def is_downloadable_domain(url: str) -> bool:
    """Check if URL is from a downloadable academic domain using proper URL parsing.

    A URL is considered downloadable when any of the following holds:
    - its path ends in ".pdf" (or the URL contains ".pdf?"),
    - its hostname is, or is a subdomain of, a known academic publisher,
    - it looks like a PubMed URL,
    - its path or query string signals a PDF ("/pdf/", "type=pdf", "format=pdf").

    Args:
        url: The URL to classify. Empty/None-like values return False.

    Returns:
        True if the URL looks downloadable, False otherwise (including on
        any parsing error, which is logged as a warning).
    """
    try:
        if not url:
            return False

        # Lowercase once and reuse for all case-insensitive checks.
        lowered = url.lower()
        parsed = urlparse(lowered)
        hostname = parsed.hostname or ""
        path = parsed.path or ""
        query = parsed.query or ""

        # Check for direct PDF files
        if path.endswith(".pdf") or ".pdf?" in lowered:
            return True

        # Set of downloadable academic domains (frozenset: membership data
        # is fixed; matching below still requires a suffix scan for
        # subdomains, but the set makes the intent explicit).
        downloadable_domains = frozenset(
            {
                "arxiv.org",
                "biorxiv.org",
                "medrxiv.org",
                "ncbi.nlm.nih.gov",
                "pubmed.ncbi.nlm.nih.gov",
                "europepmc.org",
                "semanticscholar.org",
                "researchgate.net",
                "academia.edu",
                "sciencedirect.com",
                "springer.com",
                "nature.com",
                "wiley.com",
                "ieee.org",
                "acm.org",
                "plos.org",
                "frontiersin.org",
                "mdpi.com",
                "acs.org",
                "rsc.org",
                "tandfonline.com",
                "sagepub.com",
                "oxford.com",
                "cambridge.org",
                "bmj.com",
                "nejm.org",
                "thelancet.com",
                "jamanetwork.com",
                "annals.org",
                "ahajournals.org",
                "cell.com",
                "science.org",
                "pnas.org",
                "elifesciences.org",
                "embopress.org",
                "journals.asm.org",
                "microbiologyresearch.org",
                "jvi.asm.org",
                "genome.cshlp.org",
                "genetics.org",
                "g3journal.org",
                "plantphysiol.org",
                "plantcell.org",
                "aspb.org",
                "bioone.org",
                "company-of-biologists.org",
                "biologists.org",
                "jeb.biologists.org",
                "dmm.biologists.org",
                "bio.biologists.org",
                "doi.org",
            }
        )

        # Check if hostname matches any downloadable domain (exact match
        # or a subdomain of it).
        if any(
            hostname == domain or hostname.endswith("." + domain)
            for domain in downloadable_domains
        ):
            return True

        # Special case for PubMed which might appear in path
        if "pubmed" in hostname or "/pubmed/" in path:
            return True

        # Check for PDF in path or query parameters
        if "/pdf/" in path or "type=pdf" in query or "format=pdf" in query:
            return True

        return False

    except Exception as e:
        logger.warning(f"Error parsing URL {url}: {e}")
        return False
def get_authenticated_user_password(
    username: str, flask_session_id: str = None
):
    """
    Get authenticated user password from session store with fallback to g.user_password.

    Args:
        username: The username to get password for
        flask_session_id: Optional Flask session ID. If not provided, uses
            session.get("session_id") from the current Flask session.

    Returns:
        tuple: (password, error_response) where error_response is None on success,
        or a Flask (json_body, 401) response tuple on failure (in which case
        the password element is None)
    """
    # Imported lazily to avoid pulling the password store in at module load.
    from ...database.session_passwords import session_password_store

    session_id = flask_session_id or session.get("session_id")

    # Try session password store first
    try:
        user_password = session_password_store.get_session_password(
            username, session_id
        )
        if user_password:
            logger.debug(
                f"Retrieved user password from session store for user {username}"
            )
            return user_password, None
    except Exception:
        # Best-effort: fall through to the g.user_password fallback below.
        logger.exception("Failed to get user password from session store")

    # Fallback to g.user_password (set by middleware if temp_auth was used)
    user_password = getattr(g, "user_password", None)
    if user_password:
        logger.debug(
            f"Retrieved user password from g.user_password fallback for user {username}"
        )
        return user_password, None

    # No password available
    logger.error(f"No user password available for user {username}")
    error_response = (
        jsonify(
            {
                "status": "error",
                "message": "Authentication required: Please refresh the page and log in again to access encrypted database features.",
            }
        ),
        401,
    )
    return None, error_response
192# ============= Page Routes =============
@library_bp.route("/")
@login_required
def library_page():
    """Main library page showing downloaded documents.

    Renders pages/library.html with library statistics, a (filtered) list
    of documents, and the data needed to populate the domain / research /
    collection filter dropdowns. Filters are read from the query string
    (?domain=, ?research=, ?collection=).
    """
    username = session.get("username")
    service = LibraryService(username)

    # Get library settings
    # NOTE: local import shadows the module-level import of the same name.
    from ...utilities.db_utils import get_settings_manager

    settings = get_settings_manager()
    pdf_storage_mode = settings.get_setting(
        "research_library.pdf_storage_mode", "database"
    )
    # Enable PDF storage button if mode is not "none"
    enable_pdf_storage = pdf_storage_mode != "none"
    shared_library = settings.get_setting(
        "research_library.shared_library", False
    )

    # Get statistics
    stats = service.get_library_stats()

    # Get documents with optional filters
    domain_filter = request.args.get("domain")
    research_filter = request.args.get("research")
    collection_filter = request.args.get("collection")  # New collection filter

    documents = service.get_documents(
        research_id=research_filter,
        domain=domain_filter,
        collection_id=collection_filter,
        limit=100,
    )

    # Get unique domains for filter dropdown
    unique_domains = service.get_unique_domains()

    # Get research list for filter dropdown
    research_list = service.get_research_list_with_stats()

    # Get collections list for filter dropdown
    collections = service.get_all_collections()

    return render_template_with_defaults(
        "pages/library.html",
        stats=stats,
        documents=documents,
        unique_domains=unique_domains,
        research_list=research_list,
        collections=collections,
        selected_collection=collection_filter,
        storage_path=stats.get("storage_path", ""),
        enable_pdf_storage=enable_pdf_storage,
        pdf_storage_mode=pdf_storage_mode,
        shared_library=shared_library,
    )
@library_bp.route("/document/<string:document_id>")
@login_required
def document_details_page(document_id):
    """Render the details page (metadata and links) for one document."""
    service = LibraryService(session.get("username"))

    document = service.get_document_by_id(document_id)
    if document is None:
        return "Document not found", 404

    return render_template_with_defaults(
        "pages/document_details.html", document=document
    )
@library_bp.route("/download-manager")
@login_required
def download_manager_page():
    """Download manager page for selecting and downloading research PDFs.

    Builds summary counts across all research sessions, enriches each
    session with a PDF preview and per-domain statistics, then renders
    pages/download_manager.html.
    """
    username = session.get("username")
    service = LibraryService(username)

    # Get library settings
    # NOTE: local import shadows the module-level import of the same name.
    from ...utilities.db_utils import get_settings_manager

    settings = get_settings_manager()
    pdf_storage_mode = settings.get_setting(
        "research_library.pdf_storage_mode", "database"
    )
    # Enable PDF storage button if mode is not "none"
    enable_pdf_storage = pdf_storage_mode != "none"
    shared_library = settings.get_setting(
        "research_library.shared_library", False
    )

    # Get research sessions with statistics
    research_list = service.get_research_list_with_stats()

    # Calculate summary statistics
    total_researches = len(research_list)
    total_resources = sum(r["total_resources"] for r in research_list)
    already_downloaded = sum(r["downloaded_count"] for r in research_list)
    available_to_download = (
        sum(r["downloadable_count"] for r in research_list) - already_downloaded
    )

    # Enrich research data with domain breakdowns
    for research in research_list:
        # Get PDF sources for this research
        documents = service.get_documents(
            research_id=research["id"], file_type="pdf"
        )
        research["pdf_sources"] = documents[:10]  # Preview first 10

        # Domain statistics
        # NOTE(review): documents were fetched with file_type="pdf", so the
        # "pdfs" counter presumably always tracks "total" — confirm intended.
        domains = {}
        for doc in documents:
            domain = doc.get("domain", "unknown")
            if domain not in domains:
                domains[domain] = {"total": 0, "pdfs": 0, "downloaded": 0}
            domains[domain]["total"] += 1
            if doc["file_type"] == "pdf":
                domains[domain]["pdfs"] += 1
            if doc["download_status"] == "completed":
                domains[domain]["downloaded"] += 1

        research["domains"] = domains

    return render_template_with_defaults(
        "pages/download_manager.html",
        research_list=research_list,
        total_researches=total_researches,
        total_resources=total_resources,
        already_downloaded=already_downloaded,
        available_to_download=available_to_download,
        enable_pdf_storage=enable_pdf_storage,
        pdf_storage_mode=pdf_storage_mode,
        shared_library=shared_library,
    )
338# ============= API Routes =============
@library_bp.route("/api/stats")
@login_required
def get_library_stats():
    """Return aggregate library statistics as JSON."""
    service = LibraryService(session.get("username"))
    return jsonify(service.get_library_stats())
@library_bp.route("/api/collections/list")
@login_required
def get_collections_list():
    """Return every collection (id, name, description), sorted by name."""
    username = session.get("username")

    with get_user_db_session(username) as db_session:
        rows = db_session.query(Collection).order_by(Collection.name).all()

        payload = []
        for row in rows:
            payload.append(
                {
                    "id": row.id,
                    "name": row.name,
                    "description": row.description,
                }
            )

        return jsonify({"success": True, "collections": payload})
@library_bp.route("/api/documents")
@login_required
def get_documents():
    """List library documents as JSON, applying query-string filters."""
    service = LibraryService(session.get("username"))

    # All filters come from the query string; limit/offset default to 100/0.
    args = request.args
    documents = service.get_documents(
        research_id=args.get("research_id"),
        domain=args.get("domain"),
        file_type=args.get("file_type"),
        favorites_only=args.get("favorites") == "true",
        search_query=args.get("search"),
        limit=int(args.get("limit", 100)),
        offset=int(args.get("offset", 0)),
    )

    return jsonify({"documents": documents})
@library_bp.route(
    "/api/document/<string:document_id>/favorite", methods=["POST"]
)
@login_required
def toggle_favorite(document_id):
    """Flip the favorite flag on a document and return the new state."""
    service = LibraryService(session.get("username"))
    return jsonify({"favorite": service.toggle_favorite(document_id)})
@library_bp.route("/api/document/<string:document_id>", methods=["DELETE"])
@login_required
def delete_document(document_id):
    """Remove a document from the library and report success."""
    service = LibraryService(session.get("username"))
    return jsonify({"success": service.delete_document(document_id)})
@library_bp.route("/api/document/<string:document_id>/pdf-url")
@login_required
def get_pdf_url(document_id):
    """Return the URL from which this document's PDF can be viewed."""
    # The PDF bytes themselves are served by the /pdf endpoint.
    pdf_endpoint = f"/library/api/document/{document_id}/pdf"
    return jsonify(
        {
            "url": pdf_endpoint,
            "title": "Document",  # Could fetch actual title
        }
    )
@library_bp.route("/document/<string:document_id>/pdf")
@login_required
def view_pdf_page(document_id):
    """Page for viewing PDF file - uses PDFStorageManager for retrieval.

    Looks the document up in the user's database, then loads the PDF bytes
    via PDFStorageManager (which handles both database-stored and
    filesystem-stored PDFs) and streams them inline to the browser.

    Returns:
        A PDF response on success, or a plain-text 404 when the document
        or its PDF content cannot be found.
    """
    username = session.get("username")

    with get_user_db_session(username) as db_session:
        # Get document from database
        document = db_session.query(Document).filter_by(id=document_id).first()

        if not document:
            logger.warning(
                f"Document ID {document_id} not found in database for user {username}"
            )
            return "Document not found", 404

        logger.info(
            f"Document {document_id}: title='{document.title}', "
            f"file_path={document.file_path}"
        )

        # Get settings for PDF storage manager
        settings = get_settings_manager(db_session)
        storage_mode = settings.get_setting(
            "research_library.pdf_storage_mode", "none"
        )
        library_root = Path(
            settings.get_setting(
                "research_library.storage_path",
                str(get_library_directory()),
            )
        ).expanduser()

        # Use PDFStorageManager to load PDF (handles database and filesystem)
        pdf_manager = PDFStorageManager(library_root, storage_mode)
        pdf_bytes = pdf_manager.load_pdf(document, db_session)

        if pdf_bytes:
            logger.info(
                f"Serving PDF for document {document_id} ({len(pdf_bytes)} bytes)"
            )
            # as_attachment=False renders the PDF inline in the browser.
            return send_file(
                BytesIO(pdf_bytes),
                mimetype="application/pdf",
                as_attachment=False,
                download_name=document.filename or "document.pdf",
            )

        # No PDF found anywhere
        logger.warning(f"No PDF available for document {document_id}")
        return "PDF not available", 404
@library_bp.route("/api/document/<string:document_id>/pdf")
@login_required
def serve_pdf_api(document_id):
    """Backward-compatible API alias that delegates to view_pdf_page."""
    return view_pdf_page(document_id)
@library_bp.route("/document/<string:document_id>/txt")
@login_required
def view_text_page(document_id):
    """Page for viewing text content.

    Text is stored directly on the Document row (text_content column);
    renders pages/document_text.html, or returns a plain-text 404 when
    the document or its text is missing.
    """
    username = session.get("username")

    with get_user_db_session(username) as db_session:
        # Get document by ID (text now stored in Document.text_content)
        document = db_session.query(Document).filter_by(id=document_id).first()

        if not document:
            logger.warning(f"Document not found for document ID {document_id}")
            return "Document not found", 404

        if not document.text_content:
            logger.warning(f"Document {document_id} has no text content")
            return "Text content not available", 404

        logger.info(
            f"Serving text content for document {document_id}: {len(document.text_content)} characters"
        )

        # Render as HTML page
        return render_template_with_defaults(
            "pages/document_text.html",
            document_id=document_id,
            title=document.title or "Document Text",
            text_content=document.text_content,
            extraction_method=document.extraction_method,
            word_count=document.word_count,
        )
@library_bp.route("/api/document/<string:document_id>/text")
@login_required
def serve_text_api(document_id):
    """Return a document's extracted text as JSON (backward compatibility)."""
    username = session.get("username")

    with get_user_db_session(username) as db_session:
        # Text lives on the Document row itself (text_content column).
        document = db_session.query(Document).filter_by(id=document_id).first()

        if document is None:
            logger.warning(f"Document not found for document ID {document_id}")
            return jsonify({"error": "Document not found"}), 404

        if not document.text_content:
            logger.warning(f"Document {document_id} has no text content")
            return jsonify({"error": "Text content not available"}), 404

        logger.info(
            f"Serving text content for document {document_id}: {len(document.text_content)} characters"
        )

        body = {
            "text_content": document.text_content,
            "title": document.title or "Document",
            "extraction_method": document.extraction_method,
            "word_count": document.word_count,
        }
        return jsonify(body)
@library_bp.route("/api/open-folder", methods=["POST"])
@login_required
def open_folder():
    """Open folder containing a document.

    Expects a JSON body with a "path" key. The path is validated against
    the configured library root (no absolute paths, no traversal) before
    being opened in the host file manager. All failures return a JSON
    body with success=False and a sanitized error message.
    """
    data = request.json
    path = data.get("path")

    if not path:
        return jsonify({"success": False, "error": "Path not provided"})

    try:
        # Get library root path from settings (uses centralized path, respects LDR_DATA_DIR)
        settings = get_settings_manager()
        library_root = (
            Path(
                settings.get_setting(
                    "research_library.storage_path",
                    str(get_library_directory()),
                )
            )
            .expanduser()
            .resolve()
        )

        # Validate the path is within library root
        validated_path = PathValidator.validate_safe_path(
            path, library_root, allow_absolute=False
        )

        if not validated_path or not validated_path.exists():
            return jsonify(
                {"success": False, "error": "Invalid or non-existent path"}
            )

        # Use centralized file location opener
        success = open_file_location(str(validated_path))
        return jsonify({"success": success})
    except ValueError as e:
        # Raised by PathValidator on traversal/escape attempts.
        logger.warning(f"Path validation failed: {e}")
        return jsonify({"success": False, "error": "Invalid path"})
    except Exception:
        # Never leak internal details to the client.
        logger.exception("Failed to open folder")
        return jsonify(
            {"success": False, "error": "An internal error has occurred."}
        )
@library_bp.route("/api/download/<int:resource_id>", methods=["POST"])
@login_required
def download_single_resource(resource_id):
    """Download one resource on behalf of the logged-in user."""
    username = session.get("username")
    user_password, error_response = get_authenticated_user_password(username)
    if error_response:
        return error_response

    service = DownloadService(username, user_password)
    success, error = service.download_resource(resource_id)

    if not success:
        logger.warning(f"Download failed for resource {resource_id}: {error}")
        return jsonify(
            {
                "success": False,
                "error": "Download failed. Please try again or contact support.",
            }
        ), 500

    return jsonify({"success": True})
@library_bp.route("/api/download-text/<int:resource_id>", methods=["POST"])
@login_required
def download_text_single(resource_id):
    """Extract and store a single resource's content as a text file."""
    try:
        username = session.get("username")
        user_password, error_response = get_authenticated_user_password(
            username
        )
        if error_response:
            return error_response

        service = DownloadService(username, user_password)
        success, error = service.download_as_text(resource_id)

        if success:
            return jsonify({"success": True, "error": None})

        # Sanitize error message - don't expose internal details
        if error:
            logger.warning(
                f"Download as text failed for resource {resource_id}: {error}"
            )
        return jsonify(
            {"success": False, "error": "Failed to download resource"}
        )
    except Exception as e:
        return handle_api_error(
            f"downloading resource {resource_id} as text", e
        )
@library_bp.route("/api/download-all-text", methods=["POST"])
@login_required
def download_all_text():
    """Download all undownloaded resources as text files.

    Streams progress to the client as Server-Sent Events: one JSON event
    per processed resource, followed by a final completion event.
    """
    username = session.get("username")
    # Capture Flask session ID to avoid scoping issues in nested function
    flask_session_id = session.get("session_id")

    def generate():
        # Get user password for database operations
        user_password, _ = get_authenticated_user_password(
            username, flask_session_id
        )
        if user_password is None:
            logger.error(f"Could not get password for user {username}")
            return

        download_service = DownloadService(username, user_password)

        # Get all undownloaded resources
        # NOTE: "session" here is a SQLAlchemy session and shadows the
        # Flask "session" import for the rest of this generator.
        with get_user_db_session(username) as session:
            # Get resources that don't have text files yet
            resources = session.query(ResearchResource).all()

            # Filter resources that need text extraction; extracted text
            # files live under <library_root>/txt as "*_<resource id>.txt".
            txt_path = Path(download_service.library_root) / "txt"
            resources_to_process = []

            for resource in resources:
                # Check if text file already exists
                if txt_path.exists():
                    existing = list(txt_path.glob(f"*_{resource.id}.txt"))
                    if not existing:
                        resources_to_process.append(resource)
                else:
                    resources_to_process.append(resource)

            total = len(resources_to_process)
            current = 0

            logger.info(f"Found {total} resources needing text extraction")

            for resource in resources_to_process:
                current += 1
                progress = int((current / total) * 100) if total > 0 else 100

                # NOTE(review): "resource" is always truthy inside this loop,
                # so the fallback file name is effectively unreachable.
                file_name = (
                    resource.title[:50]
                    if resource
                    else f"document_{current}.txt"
                )

                try:
                    success, error = download_service.download_as_text(
                        resource.id
                    )

                    if success:
                        status = "success"
                        error_msg = None
                    else:
                        status = "failed"
                        error_msg = error or "Text extraction failed"

                except Exception as e:
                    # Log and keep going; one bad resource must not abort
                    # the whole batch.
                    logger.exception(
                        f"Error extracting text for resource {resource.id}"
                    )
                    status = "failed"
                    error_msg = f"Text extraction failed - {type(e).__name__}"

                # Send update
                update = {
                    "progress": progress,
                    "current": current,
                    "total": total,
                    "file": file_name,
                    "url": resource.url,  # Add the URL for UI display
                    "status": status,
                    "error": error_msg,
                }
                yield f"data: {json.dumps(update)}\n\n"

            # Send completion
            yield f"data: {json.dumps({'complete': True, 'total': total})}\n\n"

    return Response(
        stream_with_context(generate()), mimetype="text/event-stream"
    )
@library_bp.route("/api/download-research/<research_id>", methods=["POST"])
@login_required
def download_research_pdfs(research_id):
    """Queue every PDF from one research session for download."""
    username = session.get("username")
    user_password, error_response = get_authenticated_user_password(username)
    if error_response:
        return error_response

    service = DownloadService(username, user_password)

    # Optional target collection supplied in the JSON body.
    payload = request.json or {}
    queued = service.queue_research_downloads(
        research_id, payload.get("collection_id")
    )

    # Start processing queue (in production, this would be a background task)
    # For now, we'll process synchronously
    # TODO: Integrate with existing queue processor
    return jsonify({"success": True, "queued": queued})
@library_bp.route("/api/download-bulk", methods=["POST"])
@login_required
def download_bulk():
    """Download PDFs or extract text from multiple research sessions.

    Expects a JSON body with:
        research_ids: list of research session IDs to process (required)
        mode: "pdf" (default) or "text_only"
        collection_id: optional target collection for downloads

    Streams per-item progress back to the client as Server-Sent Events,
    ending with a final completion event.
    """
    username = session.get("username")
    data = request.json
    research_ids = data.get("research_ids", [])
    mode = data.get("mode", "pdf")  # pdf or text_only
    collection_id = data.get(
        "collection_id"
    )  # Optional: target collection for downloads

    if not research_ids:
        return jsonify({"error": "No research IDs provided"}), 400

    # Capture Flask session ID to avoid scoping issues in nested function
    flask_session_id = session.get("session_id")

    def generate():
        """Generate progress updates as Server-Sent Events."""
        # Get user password for database operations
        user_password, _ = get_authenticated_user_password(
            username, flask_session_id
        )
        if user_password is None:
            return

        download_service = DownloadService(username, user_password)

        # Count total pending queue items across all research IDs
        total = 0
        current = 0

        # NOTE: "session" below is a SQLAlchemy session and shadows the
        # Flask "session" import inside this generator.
        with get_user_db_session(username) as session:
            for research_id in research_ids:
                count = (
                    session.query(LibraryDownloadQueue)
                    .filter_by(
                        research_id=research_id, status=DocumentStatus.PENDING
                    )
                    .count()
                )
                total += count
                logger.debug(
                    f"[PROGRESS_DEBUG] Research {research_id}: {count} pending items in queue"
                )

        logger.info(
            f"[PROGRESS_DEBUG] Total pending downloads across all research: {total}"
        )
        yield f"data: {json.dumps({'progress': 0, 'current': 0, 'total': total})}\n\n"

        # Process each research
        for research_id in research_ids:
            # Get queued downloads for this research
            with get_user_db_session(username) as session:
                # Get pending queue items for this research
                queue_items = (
                    session.query(LibraryDownloadQueue)
                    .filter_by(
                        research_id=research_id, status=DocumentStatus.PENDING
                    )
                    .all()
                )

                # If no items queued yet, queue them now
                if not queue_items:
                    try:
                        download_service.queue_research_downloads(
                            research_id, collection_id
                        )
                        # Re-fetch queue items
                        # FIX: use DocumentStatus.PENDING (was the raw
                        # string "pending", inconsistent with every other
                        # status filter in this module).
                        queue_items = (
                            session.query(LibraryDownloadQueue)
                            .filter_by(
                                research_id=research_id,
                                status=DocumentStatus.PENDING,
                            )
                            .all()
                        )
                    except Exception:
                        logger.exception(
                            f"Error queueing downloads for research {research_id}"
                        )
                        # Continue with empty queue_items
                        queue_items = []

                # Process each queued item
                for queue_item in queue_items:
                    logger.debug(
                        f"[PROGRESS_DEBUG] Before increment: current={current} (type: {type(current)}), total={total} (type: {type(total)})"
                    )
                    current += 1
                    logger.debug(
                        f"[PROGRESS_DEBUG] After increment: current={current} (type: {type(current)})"
                    )

                    # Check for division issues
                    if total is None:
                        logger.error(
                            "[PROGRESS_DEBUG] ERROR: total is None! Setting to 0 to avoid crash"
                        )
                        total = 0

                    progress = (
                        int((current / total) * 100) if total > 0 else 100
                    )
                    logger.debug(
                        f"[PROGRESS_DEBUG] Calculated progress: {progress}%"
                    )

                    # Get resource info (for a display name in the UI)
                    resource = session.query(ResearchResource).get(
                        queue_item.resource_id
                    )
                    file_name = (
                        resource.title[:50]
                        if resource
                        else f"document_{current}.pdf"
                    )

                    # Attempt actual download with error handling
                    skip_reason = None
                    status = "skipped"  # Default to skipped
                    success = False
                    error_msg = None

                    try:
                        logger.debug(
                            f"Attempting {'PDF download' if mode == 'pdf' else 'text extraction'} for resource {queue_item.resource_id}"
                        )

                        # Call appropriate service method based on mode
                        if mode == "pdf":
                            result = download_service.download_resource(
                                queue_item.resource_id
                            )
                        else:  # text_only
                            result = download_service.download_as_text(
                                queue_item.resource_id
                            )

                        # Handle new tuple return format
                        if isinstance(result, tuple):
                            success, skip_reason = result
                        else:
                            success = result
                            skip_reason = None

                        status = "success" if success else "skipped"
                        if skip_reason and not success:
                            error_msg = skip_reason
                            logger.info(
                                f"{'Download' if mode == 'pdf' else 'Text extraction'} skipped for resource {queue_item.resource_id}: {skip_reason}"
                            )

                        logger.debug(
                            f"{'Download' if mode == 'pdf' else 'Text extraction'} result: success={success}, status={status}, skip_reason={skip_reason}"
                        )
                    except Exception as e:
                        # Log error but continue processing
                        error_msg = str(e)
                        error_type = type(e).__name__
                        logger.info(
                            f"CAUGHT Download exception for resource {queue_item.resource_id}: {error_type}: {error_msg}"
                        )
                        # Check if this is a skip reason (not a real error)
                        # Use error category + categorized message for user display
                        if any(
                            phrase in error_msg.lower()
                            for phrase in [
                                "paywall",
                                "subscription",
                                "not available",
                                "not found",
                                "no free",
                                "embargoed",
                                "forbidden",
                                "not accessible",
                            ]
                        ):
                            status = "skipped"
                            skip_reason = f"Document not accessible (paywall or access restriction) - {error_type}"
                        elif any(
                            phrase in error_msg.lower()
                            for phrase in [
                                "failed to download",
                                "could not",
                                "invalid",
                                "server",
                            ]
                        ):
                            status = "failed"
                            skip_reason = f"Download failed - {error_type}"
                        else:
                            status = "failed"
                            skip_reason = f"Processing failed - {error_type}"
                        success = False

                        # Ensure skip_reason is set if we have an error message
                        if error_msg and not skip_reason:
                            skip_reason = f"Processing failed - {error_type}"
                            logger.debug(
                                f"Setting skip_reason from error_msg: {error_msg}"
                            )

                    # Send progress update
                    update_data = {
                        "progress": progress,
                        "current": current,
                        "total": total,
                        "file": file_name,
                        "status": status,
                    }
                    # Add skip reason if available
                    if skip_reason:
                        update_data["error"] = skip_reason
                        logger.info(f"Sending skip reason to UI: {skip_reason}")

                    logger.info(f"Update data being sent: {update_data}")
                    yield f"data: {json.dumps(update_data)}\n\n"

        yield f"data: {json.dumps({'progress': 100, 'current': total, 'total': total, 'complete': True})}\n\n"

    return Response(
        stream_with_context(generate()),
        mimetype="text/event-stream",
        headers={"Cache-Control": "no-cache", "X-Accel-Buffering": "no"},
    )
@library_bp.route("/api/research-list")
@login_required
def get_research_list():
    """Return all research sessions with their download statistics."""
    service = LibraryService(session.get("username"))
    return jsonify({"research": service.get_research_list_with_stats()})
@library_bp.route("/api/sync-library", methods=["POST"])
@login_required
def sync_library():
    """Reconcile the library database with the filesystem; return stats."""
    service = LibraryService(session.get("username"))
    return jsonify(service.sync_library_with_filesystem())
@library_bp.route("/api/mark-redownload", methods=["POST"])
@login_required
def mark_for_redownload():
    """Flag the given documents so they will be downloaded again."""
    service = LibraryService(session.get("username"))

    data = request.json
    document_ids = data.get("document_ids", [])
    if not document_ids:
        return jsonify({"error": "No document IDs provided"}), 400

    marked = service.mark_for_redownload(document_ids)
    return jsonify({"success": True, "marked": marked})
@library_bp.route("/api/queue-all-undownloaded", methods=["POST"])
@login_required
def queue_all_undownloaded():
    """Queue all articles that haven't been downloaded yet.

    Finds every ResearchResource without a completed Document, applies the
    retry-policy and downloadable-domain filters, and ensures each eligible
    resource has a pending entry in the download queue (resetting any
    non-pending existing entry back to pending).

    Returns:
        JSON summary: queued/skipped counts, affected research IDs, and the
        detailed filter summary. Downloads themselves are processed later by
        the SSE streaming endpoint or background tasks, never inline here.
    """
    username = session.get("username")

    logger.info(f"queue_all_undownloaded called for user {username}")

    with get_user_db_session(username) as db_session:
        # Find all resources that don't have a completed download
        undownloaded = (
            db_session.query(ResearchResource)
            .outerjoin(
                Document,
                (ResearchResource.id == Document.resource_id)
                & (Document.status == "completed"),
            )
            .filter(Document.id.is_(None))
            .all()
        )

        logger.info(f"Found {len(undownloaded)} total undownloaded resources")

        # Get user password for encrypted database access
        user_password, error_response = get_authenticated_user_password(
            username
        )
        if error_response:
            return error_response

        resource_filter = ResourceFilter(username, user_password)
        filter_results = resource_filter.filter_downloadable_resources(
            undownloaded
        )
        # Index filter results by resource id once. The previous version did
        # a linear next(...) scan per resource, i.e. O(n^2) overall.
        results_by_id = {r.resource_id: r for r in filter_results}

        # Get detailed filtering summary
        filter_summary = resource_filter.get_filter_summary(undownloaded)
        skipped_info = resource_filter.get_skipped_resources_info(undownloaded)

        logger.info(f"Filter results: {filter_summary.to_dict()}")

        queued_count = 0
        research_ids = set()
        skipped_count = 0

        for resource in undownloaded:
            # Check if resource passed the smart filter
            filter_result = results_by_id.get(resource.id)

            if not filter_result or not filter_result.can_retry:
                skipped_count += 1
                if filter_result:
                    logger.debug(
                        f"Skipping resource {resource.id} due to retry policy: {filter_result.reason}"
                    )
                else:
                    logger.debug(
                        f"Skipping resource {resource.id} - no filter result available"
                    )
                continue

            # Check if it's downloadable using proper URL parsing
            if not resource.url:
                skipped_count += 1
                continue

            is_downloadable = is_downloadable_domain(resource.url)

            # Log what we're checking
            if resource.url and "pubmed" in resource.url.lower():
                logger.info(f"Found PubMed URL: {resource.url[:100]}")

            if not is_downloadable:
                skipped_count += 1
                logger.debug(
                    f"Skipping non-downloadable URL: {resource.url[:100] if resource.url else 'None'}"
                )
                continue

            # Check if already in queue (any status)
            existing_queue = (
                db_session.query(LibraryDownloadQueue)
                .filter_by(resource_id=resource.id)
                .first()
            )

            if existing_queue:
                # If it exists but isn't pending, reset it to pending
                if existing_queue.status != DocumentStatus.PENDING:
                    existing_queue.status = DocumentStatus.PENDING
                    existing_queue.completed_at = None
                    queued_count += 1
                    research_ids.add(resource.research_id)
                    logger.debug(
                        f"Reset queue entry for resource {resource.id} to pending"
                    )
                else:
                    # Already pending, still count it
                    queued_count += 1
                    research_ids.add(resource.research_id)
                    logger.debug(
                        f"Resource {resource.id} already pending in queue"
                    )
            else:
                # Add new entry to queue
                queue_entry = LibraryDownloadQueue(
                    resource_id=resource.id,
                    research_id=resource.research_id,
                    priority=0,
                    status=DocumentStatus.PENDING,
                )
                db_session.add(queue_entry)
                queued_count += 1
                research_ids.add(resource.research_id)
                logger.debug(
                    f"Added new queue entry for resource {resource.id}"
                )

        db_session.commit()

        logger.info(
            f"Queued {queued_count} articles for download, skipped {skipped_count} resources (including {filter_summary.permanently_failed_count} permanently failed and {filter_summary.temporarily_failed_count} temporarily failed)"
        )

        # Note: Removed synchronous download processing here to avoid blocking the HTTP request
        # Downloads will be processed via the SSE streaming endpoint or background tasks

        return jsonify(
            {
                "success": True,
                "queued": queued_count,
                "research_ids": list(research_ids),
                "total_undownloaded": len(undownloaded),
                "skipped": skipped_count,
                "filter_summary": filter_summary.to_dict(),
                "skipped_details": skipped_info,
            }
        )
@library_bp.route("/api/get-research-sources/<research_id>", methods=["GET"])
@login_required
def get_research_sources(research_id):
    """Get all sources for a research with snippets.

    Args:
        research_id: ID of the research whose resources should be listed.

    Returns:
        JSON ``{"success": True, "sources": [...], "total": <n>}`` where each
        source carries its URL, title, snippet, domain, and (when a completed
        Document exists) download metadata.
    """
    username = session.get("username")

    sources = []
    with get_user_db_session(username) as db_session:
        # Get all resources for this research
        resources = (
            db_session.query(ResearchResource)
            .filter_by(research_id=research_id)
            .order_by(ResearchResource.created_at)
            .all()
        )

        for idx, resource in enumerate(resources, 1):
            # Check if document exists
            document = (
                db_session.query(Document)
                .filter_by(resource_id=resource.id)
                .first()
            )

            # Get domain from URL. urlparse is imported at module level;
            # the old in-loop re-import was redundant.
            domain = ""
            if resource.url:
                try:
                    domain = urlparse(resource.url).hostname or ""
                except ValueError:
                    # Malformed URL — leave the domain empty. (The old bare
                    # `except:` also swallowed KeyboardInterrupt/SystemExit.)
                    pass

            source_data = {
                "number": idx,
                "resource_id": resource.id,
                "url": resource.url,
                "title": resource.title or f"Source {idx}",
                "snippet": resource.content_preview or "",
                "domain": domain,
                "relevance_score": getattr(resource, "relevance_score", None),
                "downloaded": False,
                "document_id": None,
                "file_type": None,
            }

            if document and document.status == "completed":
                source_data.update(
                    {
                        "downloaded": True,
                        "document_id": document.id,
                        "file_type": document.file_type,
                        "download_date": document.created_at.isoformat()
                        if document.created_at
                        else None,
                    }
                )

            sources.append(source_data)

    return jsonify({"success": True, "sources": sources, "total": len(sources)})
@library_bp.route("/api/check-downloads", methods=["POST"])
@login_required
def check_downloads():
    """Check download status for a list of URLs.

    Expects a JSON body with ``research_id`` and ``urls``; responds with a
    mapping from each matched URL to its download state.
    """
    username = session.get("username")
    payload = request.json
    research_id = payload.get("research_id")
    urls = payload.get("urls", [])

    if not research_id or not urls:
        return jsonify({"error": "Missing research_id or urls"}), 400

    download_status = {}

    with get_user_db_session(username) as db_session:
        # Restrict to this research's resources whose URL was requested.
        matching = (
            db_session.query(ResearchResource)
            .filter_by(research_id=research_id)
            .filter(ResearchResource.url.in_(urls))
            .all()
        )

        for res in matching:
            # Look up an associated document, if any.
            doc = (
                db_session.query(Document)
                .filter_by(resource_id=res.id)
                .first()
            )

            if doc is not None and doc.status == "completed":
                entry = {
                    "downloaded": True,
                    "document_id": doc.id,
                    "file_path": doc.file_path,
                    "file_type": doc.file_type,
                    "title": doc.title or res.title,
                }
            else:
                entry = {
                    "downloaded": False,
                    "resource_id": res.id,
                }
            download_status[res.url] = entry

    return jsonify({"download_status": download_status})
@library_bp.route("/api/download-source", methods=["POST"])
@login_required
def download_source():
    """Download a single source from a research.

    Expects a JSON body with ``research_id`` and ``url``. Queues the resource
    with elevated priority and runs the download synchronously.

    Returns:
        JSON success/failure payload; 400 on bad input, 404 if the resource
        is unknown.
    """
    username = session.get("username")
    user_password, error_response = get_authenticated_user_password(username)
    if error_response:
        return error_response

    # Tolerate a missing or non-JSON body (consistent with other POST routes).
    data = request.get_json(silent=True) or {}
    research_id = data.get("research_id")
    url = data.get("url")

    if not research_id or not url:
        return jsonify({"error": "Missing research_id or url"}), 400

    # Check if URL is downloadable
    if not is_downloadable_domain(url):
        return jsonify({"error": "URL is not from a downloadable domain"}), 400

    with get_user_db_session(username) as db_session:
        # Find the resource
        resource = (
            db_session.query(ResearchResource)
            .filter_by(research_id=research_id, url=url)
            .first()
        )

        if not resource:
            return jsonify({"error": "Resource not found"}), 404

        # Check if already downloaded
        existing = (
            db_session.query(Document)
            .filter_by(resource_id=resource.id)
            .first()
        )

        # BUG FIX: every other route in this module reads Document.status
        # ("completed"); the old code read a nonexistent `download_status`
        # attribute, so the "already downloaded" short-circuit never worked.
        if existing and existing.status == "completed":
            return jsonify(
                {
                    "success": True,
                    "message": "Already downloaded",
                    "document_id": existing.id,
                }
            )

        # Add to download queue
        queue_entry = (
            db_session.query(LibraryDownloadQueue)
            .filter_by(resource_id=resource.id)
            .first()
        )

        if not queue_entry:
            queue_entry = LibraryDownloadQueue(
                resource_id=resource.id,
                research_id=resource.research_id,
                priority=1,  # Higher priority for manual downloads
                status=DocumentStatus.PENDING,
            )
            db_session.add(queue_entry)
        else:
            queue_entry.status = DocumentStatus.PENDING
            queue_entry.priority = 1

        db_session.commit()

        # Start download immediately
        service = DownloadService(username, user_password)
        success, message = service.download_resource(resource.id)

        if success:
            return jsonify({"success": True, "message": "Download completed"})
        else:
            # Log the internal failure reason (previously the comment claimed
            # this but nothing was logged); show only a generic message.
            logger.warning(
                f"Download failed for resource {resource.id}: {message}"
            )
            return jsonify({"success": False, "message": "Download failed"})