Coverage for src/local_deep_research/research_library/routes/library_routes.py: 94%
550 statements
« prev ^ index » next coverage.py v7.14.1, created at 2026-06-03 23:15 +0000
« prev ^ index » next coverage.py v7.14.1, created at 2026-06-03 23:15 +0000
1"""
2Routes for Research Library and Download Manager
4Provides web endpoints for:
5- Library browsing and management
6- Download manager interface
7- API endpoints for downloads and queries
8"""
10import json
11import math
12from io import BytesIO
13from pathlib import Path
14from flask import (
15 Blueprint,
16 g,
17 jsonify,
18 request,
19 session,
20 Response,
21 send_file,
22 stream_with_context,
23)
24from loguru import logger
26from ...security.decorators import require_json_body
27from ...web.auth.decorators import login_required
28from ...web.utils.templates import render_template_with_defaults
29from ...database.session_context import get_user_db_session, safe_rollback
30from ...database.models.research import ResearchResource
31from ...database.models.library import (
32 Document as Document,
33 DocumentStatus,
34 DownloadQueue as LibraryDownloadQueue,
35 Collection,
36)
37from ...library.download_management import ResourceFilter
38from ..services.download_service import DownloadService
39from ..services.library_service import LibraryService
40from ..services.pdf_storage_manager import PDFStorageManager
41from ..utils import (
42 get_document_for_resource,
43 handle_api_error,
44 is_downloadable_domain,
45 is_downloadable_url,
46)
47from ...utilities.db_utils import get_settings_manager
48from ...config.paths import get_library_directory
49from ...web.exceptions import AuthenticationRequiredError
# Create Blueprint
# Every route registered on this blueprint is served under the "/library"
# URL prefix.
library_bp = Blueprint("library", __name__, url_prefix="/library")
54# NOTE: Routes use session["username"] (not .get()) intentionally.
55# @login_required guarantees the key exists; direct access fails fast
56# if the decorator is ever removed.
59# Error handler for authentication errors
@library_bp.errorhandler(Exception)
def handle_web_api_exception(error):
    """Turn WebAPIException (and subclasses) into a JSON error response.

    Any other exception type is re-raised unchanged.
    """
    # Imported at call time rather than at module import.
    from ...web.exceptions import WebAPIException

    if not isinstance(error, WebAPIException):
        # Not one of ours: hand it back to the caller untouched.
        raise error

    return jsonify(error.to_dict()), error.status_code
def get_authenticated_user_password(
    username: str, flask_session_id: str | None = None
) -> str:
    """
    Look up the password of an authenticated user.

    The session password store is consulted first. When it returns nothing
    (or raises), ``g.user_password`` is used as a fallback.

    Args:
        username: The username to get the password for
        flask_session_id: Optional Flask session ID. When not provided,
            ``session.get("session_id")`` is used.

    Returns:
        str: The user's password

    Raises:
        AuthenticationRequiredError: If neither source yields a password
    """
    from ...database.session_passwords import session_password_store

    session_id = flask_session_id or session.get("session_id")

    # First choice: the session password store. A failure here is logged
    # and treated the same as "nothing stored".
    try:
        stored_password = session_password_store.get_session_password(
            username, session_id
        )
    except Exception:
        logger.exception("Failed to get user password from session store")
        stored_password = None

    if stored_password:
        logger.debug(
            f"Retrieved user password from session store for user {username}"
        )
        return stored_password

    # Second choice: whatever is attached to the request context ``g``.
    fallback_password = getattr(g, "user_password", None)
    if not fallback_password:
        logger.error(f"No user password available for user {username}")
        raise AuthenticationRequiredError(
            message="Authentication required: Please refresh the page and log in again to access encrypted database features.",
        )

    logger.debug(
        f"Retrieved user password from g.user_password fallback for user {username}"
    )
    return fallback_password
119# ============= Page Routes =============
@library_bp.route("/")
@login_required
def library_page():
    """Render the main library page: stats, filter dropdowns and one page of documents."""
    username = session["username"]
    library = LibraryService(username)

    # Library settings that drive the PDF-storage controls in the template.
    from ...utilities.db_utils import get_settings_manager

    settings_manager = get_settings_manager()
    pdf_storage_mode = settings_manager.get_setting(
        "research_library.pdf_storage_mode", "database"
    )
    shared_library = settings_manager.get_setting(
        "research_library.shared_library", False
    )

    stats = library.get_library_stats()

    # Optional filters from the query string (None when absent).
    query_args = request.args
    domain_filter = query_args.get("domain")
    research_filter = query_args.get("research")
    collection_filter = query_args.get("collection")
    date_filter = query_args.get("date")

    from ...database.library_init import get_default_library_id

    # The collection is resolved once and reused by the count and the fetch.
    filters = {
        "research_id": research_filter,
        "domain": domain_filter,
        "collection_id": collection_filter or get_default_library_id(username),
        "date_filter": date_filter,
    }

    # Pagination: 100 documents per page, requested page clamped into range.
    page_size = 100
    total_docs = library.count_documents(**filters)
    total_pages = max(1, math.ceil(total_docs / page_size))
    requested_page = request.args.get("page", 1, type=int)
    page = max(1, min(requested_page, total_pages))

    documents = library.get_documents(
        **filters,
        limit=page_size,
        offset=(page - 1) * page_size,
    )

    # Data for the filter dropdowns.
    unique_domains = library.get_unique_domains()
    research_list = library.get_research_list_for_dropdown()
    collections = library.get_all_collections()

    # ID of the first collection flagged as default (None if there is none).
    default_collection_id = None
    for collection in collections:
        if collection.get("is_default"):
            default_collection_id = collection["id"]
            break

    return render_template_with_defaults(
        "pages/library.html",
        stats=stats,
        documents=documents,
        unique_domains=unique_domains,
        research_list=research_list,
        collections=collections,
        selected_collection=collection_filter,
        default_collection_id=default_collection_id,
        storage_path=stats.get("storage_path", ""),
        # The PDF storage button is enabled unless the mode is "none".
        enable_pdf_storage=pdf_storage_mode != "none",
        pdf_storage_mode=pdf_storage_mode,
        shared_library=shared_library,
        page=page,
        total_pages=total_pages,
        selected_date=date_filter,
        selected_research=research_filter,
        selected_domain=domain_filter,
    )
@library_bp.route("/document/<string:document_id>")
@login_required
def document_details_page(document_id):
    """Render the details page for one document, or a plain 404 if it is missing."""
    library = LibraryService(session["username"])
    document = library.get_document_by_id(document_id)

    if document:
        return render_template_with_defaults(
            "pages/document_details.html", document=document
        )

    return "Document not found", 404
@library_bp.route("/download-manager")
@login_required
def download_manager_page():
    """Render the download manager: paginated research sessions with PDF previews."""
    username = session["username"]
    library = LibraryService(username)

    # Library settings that drive the PDF-storage controls in the template.
    from ...utilities.db_utils import get_settings_manager

    settings_manager = get_settings_manager()
    pdf_storage_mode = settings_manager.get_setting(
        "research_library.pdf_storage_mode", "database"
    )
    shared_library = settings_manager.get_setting(
        "research_library.shared_library", False
    )

    # The summary covers all sessions; its total also determines the page count.
    page_size = 50
    summary = library.get_download_manager_summary_stats()
    total_pages = max(1, math.ceil(summary["total_researches"] / page_size))

    # Requested page, clamped into [1, total_pages].
    requested_page = request.args.get("page", 1, type=int)
    page = max(1, min(requested_page, total_pages))

    research_list = library.get_research_list_with_stats(
        limit=page_size, offset=(page - 1) * page_size
    )

    # One batched lookup for the previews of every session on this page.
    previews = library.get_pdf_previews_batch(
        [entry["id"] for entry in research_list]
    )
    for entry in research_list:
        # Sessions missing from the batch result get empty preview data.
        preview = previews.get(entry["id"], {"pdf_sources": [], "domains": {}})
        entry["pdf_sources"] = preview["pdf_sources"]
        entry["domains"] = preview["domains"]

    return render_template_with_defaults(
        "pages/download_manager.html",
        research_list=research_list,
        total_researches=summary["total_researches"],
        total_resources=summary["total_resources"],
        already_downloaded=summary["already_downloaded"],
        available_to_download=summary["available_to_download"],
        # The PDF storage button is enabled unless the mode is "none".
        enable_pdf_storage=pdf_storage_mode != "none",
        pdf_storage_mode=pdf_storage_mode,
        shared_library=shared_library,
        page=page,
        total_pages=total_pages,
    )
290# ============= API Routes =============
@library_bp.route("/api/stats")
@login_required
def get_library_stats():
    """Return the current user's library statistics as JSON."""
    library = LibraryService(session["username"])
    return jsonify(library.get_library_stats())
@library_bp.route("/api/collections/list")
@login_required
def get_collections_list():
    """Return every collection (id, name, description) ordered by name."""
    username = session["username"]

    with get_user_db_session(username) as db_session:
        ordered = db_session.query(Collection).order_by(Collection.name).all()

        # Flatten the ORM rows into plain dicts while the session is open.
        payload = []
        for collection in ordered:
            payload.append(
                {
                    "id": collection.id,
                    "name": collection.name,
                    "description": collection.description,
                }
            )

    return jsonify({"success": True, "collections": payload})
@library_bp.route("/api/documents")
@login_required
def get_documents():
    """Return the current user's library documents as JSON, with optional filters.

    Query parameters:
        research_id, domain, file_type, search: forwarded to
            ``LibraryService.get_documents`` (``None`` when absent).
        favorites: the favorites-only filter is enabled only by the literal
            string ``"true"``.
        limit: maximum number of documents, default 100.
        offset: number of documents to skip, default 0.

    Returns:
        JSON object ``{"documents": [...]}``.
    """
    username = session["username"]
    service = LibraryService(username)

    args = request.args

    # ``type=int`` makes a non-numeric value fall back to the default instead
    # of raising ValueError (which would surface as an HTTP 500). This is the
    # same way the library page parses its ``page`` parameter.
    limit = args.get("limit", 100, type=int)
    offset = args.get("offset", 0, type=int)

    documents = service.get_documents(
        research_id=args.get("research_id"),
        domain=args.get("domain"),
        file_type=args.get("file_type"),
        favorites_only=args.get("favorites") == "true",
        search_query=args.get("search"),
        limit=limit,
        offset=offset,
    )

    return jsonify({"documents": documents})
@library_bp.route(
    "/api/document/<string:document_id>/favorite", methods=["POST"]
)
@login_required
def toggle_favorite(document_id):
    """Flip a document's favorite flag and return the resulting state."""
    library = LibraryService(session["username"])
    new_state = library.toggle_favorite(document_id)
    return jsonify({"favorite": new_state})
@library_bp.route("/api/document/<string:document_id>", methods=["DELETE"])
@login_required
def delete_document(document_id):
    """Remove a document from the library and report whether it succeeded."""
    library = LibraryService(session["username"])
    deleted = library.delete_document(document_id)
    return jsonify({"success": deleted})
@library_bp.route("/api/document/<string:document_id>/pdf-url")
@login_required
def get_pdf_url(document_id):
    """Return the URL under which the document's PDF is served."""
    pdf_url = f"/library/api/document/{document_id}/pdf"
    # The title is a fixed placeholder; the document itself is not looked up.
    return jsonify({"url": pdf_url, "title": "Document"})
@library_bp.route("/document/<string:document_id>/pdf")
@login_required
def view_pdf_page(document_id):
    """Serve a document's PDF inline, loading the bytes through PDFStorageManager.

    Responds with a plain-text 404 when the document does not exist or when
    no PDF bytes could be loaded for it.
    """
    username = session["username"]

    with get_user_db_session(username) as db_session:
        document = db_session.query(Document).filter_by(id=document_id).first()

        if not document:
            logger.warning(
                f"Document ID {document_id} not found in database for user {username}"
            )
            return "Document not found", 404

        logger.info(
            f"Document {document_id}: title='{document.title}', file_path={document.file_path}"
        )

        # Settings the storage manager needs: storage mode and library root.
        settings = get_settings_manager(db_session)
        storage_mode = settings.get_setting(
            "research_library.pdf_storage_mode", "none"
        )
        configured_root = settings.get_setting(
            "research_library.storage_path",
            str(get_library_directory()),
        )
        library_root = Path(configured_root).expanduser().resolve()

        pdf_manager = PDFStorageManager(library_root, storage_mode)
        pdf_bytes = pdf_manager.load_pdf(document, db_session)

        if not pdf_bytes:
            logger.warning(f"No PDF available for document {document_id}")
            return "PDF not available", 404

        logger.info(
            f"Serving PDF for document {document_id} ({len(pdf_bytes)} bytes)"
        )
        return send_file(
            BytesIO(pdf_bytes),
            mimetype="application/pdf",
            as_attachment=False,
            download_name=document.filename or "document.pdf",
        )
@library_bp.route("/api/document/<string:document_id>/pdf")
@login_required
def serve_pdf_api(document_id):
    """Backward-compatible API URL for a PDF; delegates to ``view_pdf_page``."""
    response = view_pdf_page(document_id)
    return response
@library_bp.route("/document/<string:document_id>/txt")
@login_required
def view_text_page(document_id):
    """Render an HTML page showing a document's extracted text."""
    username = session["username"]

    with get_user_db_session(username) as db_session:
        # The text is read from the Document row itself (text_content).
        document = db_session.query(Document).filter_by(id=document_id).first()

        if not document:
            logger.warning(f"Document not found for document ID {document_id}")
            return "Document not found", 404

        text = document.text_content
        if not text:
            logger.warning(f"Document {document_id} has no text content")
            return "Text content not available", 404

        logger.info(
            f"Serving text content for document {document_id}: {len(text)} characters"
        )

        return render_template_with_defaults(
            "pages/document_text.html",
            document_id=document_id,
            title=document.title or "Document Text",
            text_content=text,
            extraction_method=document.extraction_method,
            word_count=document.word_count,
        )
@library_bp.route("/api/document/<string:document_id>/text")
@login_required
def serve_text_api(document_id):
    """Backward-compatible JSON endpoint returning a document's extracted text."""
    username = session["username"]

    with get_user_db_session(username) as db_session:
        # The text is read from the Document row itself (text_content).
        document = db_session.query(Document).filter_by(id=document_id).first()

        if not document:
            logger.warning(f"Document not found for document ID {document_id}")
            return jsonify({"error": "Document not found"}), 404

        text = document.text_content
        if not text:
            logger.warning(f"Document {document_id} has no text content")
            return jsonify({"error": "Text content not available"}), 404

        logger.info(
            f"Serving text content for document {document_id}: {len(text)} characters"
        )

        payload = {
            "text_content": text,
            "title": document.title or "Document",
            "extraction_method": document.extraction_method,
            "word_count": document.word_count,
        }
        return jsonify(payload)
@library_bp.route("/api/open-folder", methods=["POST"])
@login_required
def open_folder():
    """Refuse to open a document's folder; always answers 403.

    Security: This endpoint is disabled for server deployments.
    Opening a folder only makes sense for desktop usage, where the server
    and the client are on the same machine.
    """
    refusal = {
        "status": "error",
        "message": "This feature is disabled. It is only available in desktop mode.",
    }
    return jsonify(refusal), 403
@library_bp.route("/api/download/<int:resource_id>", methods=["POST"])
@login_required
def download_single_resource(resource_id):
    """Download one resource; answer 500 with a generic message on failure."""
    username = session["username"]
    password = get_authenticated_user_password(username)

    with DownloadService(username, password) as downloader:
        ok, failure = downloader.download_resource(resource_id)

        if not ok:
            # The detailed reason is logged only; the client gets a generic message.
            logger.warning(
                f"Download failed for resource {resource_id}: {failure}"
            )
            return jsonify(
                {
                    "success": False,
                    "error": "Download failed. Please try again or contact support.",
                }
            ), 500

        return jsonify({"success": True})
@library_bp.route("/api/download-text/<int:resource_id>", methods=["POST"])
@login_required
def download_text_single(resource_id):
    """Extract one resource as text and report the outcome as JSON."""
    try:
        username = session["username"]
        password = get_authenticated_user_password(username)

        with DownloadService(username, password) as downloader:
            ok, failure = downloader.download_as_text(resource_id)

            if ok:
                return jsonify({"success": True, "error": None})

            # The detailed reason is logged only; the client gets a generic message.
            if failure:
                logger.warning(
                    f"Download as text failed for resource {resource_id}: {failure}"
                )
            return jsonify(
                {"success": False, "error": "Failed to download resource"}
            )
    except AuthenticationRequiredError:
        # Left for the blueprint-level error handler.
        raise
    except Exception as e:
        return handle_api_error(
            f"downloading resource {resource_id} as text", e
        )
@library_bp.route("/api/download-all-text", methods=["POST"])
@login_required
def download_all_text():
    """Extract text for all downloadable resources, streaming progress as SSE.

    Every ``ResearchResource`` whose URL passes ``is_downloadable_url`` and
    that has no ``*_{id}.txt`` file in the library's ``txt`` directory is
    passed to ``DownloadService.download_as_text``.

    Returns:
        A ``text/event-stream`` response. One event is sent per resource with
        the keys ``progress``, ``current``, ``total``, ``file``, ``url``,
        ``status`` (``"success"`` or ``"failed"``) and ``error``. The stream
        ends with ``{"complete": true, "total": ...}``. If no password is
        available, a single event with ``error: "Authentication required"``
        and ``complete: true`` is sent instead.
    """
    username = session["username"]
    # Read the Flask session ID up front and pass it into the generator as a
    # plain value, so the generator does not touch Flask's ``session`` itself.
    flask_session_id = session.get("session_id")

    def generate():
        # The password is needed for database operations.
        try:
            user_password = get_authenticated_user_password(
                username, flask_session_id
            )
        except AuthenticationRequiredError:
            logger.warning(
                f"Authentication unavailable for user {username} - password not in session store"
            )
            yield f"data: {json.dumps({'progress': 0, 'current': 0, 'total': 0, 'error': 'Authentication required', 'complete': True})}\n\n"
            return

        download_service = DownloadService(username, user_password)
        try:
            # Named ``db_session`` so it does not shadow Flask's ``session``.
            with get_user_db_session(username) as db_session:
                all_resources = db_session.query(ResearchResource).all()
                # Keep only downloadable resources (academic/PDF).
                resources = [
                    r for r in all_resources if is_downloadable_url(r.url)
                ]

                # Scan the txt directory once and collect the resource IDs
                # that already have a file named ``*_{id}.txt``.
                txt_path = Path(download_service.library_root) / "txt"
                existing_resource_ids = set()
                if txt_path.exists():
                    for txt_file in txt_path.glob("*.txt"):
                        parts = txt_file.stem.rsplit("_", 1)
                        if len(parts) != 2:
                            continue
                        try:
                            existing_resource_ids.add(int(parts[1]))
                        except ValueError:
                            # The suffix is not a resource ID; ignore the file.
                            continue

                resources_to_process = [
                    r for r in resources if r.id not in existing_resource_ids
                ]
                total = len(resources_to_process)

                logger.info(f"Found {total} resources needing text extraction")

                for current, resource in enumerate(resources_to_process, start=1):
                    # total >= 1 whenever this loop body runs.
                    progress = int((current / total) * 100)

                    # Guard on the title, not the row: a resource without a
                    # title gets the generic name instead of raising TypeError
                    # on ``None[:50]``.
                    file_name = (
                        resource.title[:50]
                        if resource.title
                        else f"document_{current}.txt"
                    )

                    try:
                        success, error = download_service.download_as_text(
                            resource.id
                        )
                        if success:
                            status = "success"
                            error_msg = None
                        else:
                            status = "failed"
                            error_msg = error or "Text extraction failed"
                    except Exception as e:
                        # One failing resource must not abort the whole run.
                        logger.exception(
                            f"Error extracting text for resource {resource.id}"
                        )
                        status = "failed"
                        error_msg = (
                            f"Text extraction failed - {type(e).__name__}"
                        )

                    update = {
                        "progress": progress,
                        "current": current,
                        "total": total,
                        "file": file_name,
                        "url": resource.url,  # shown in the UI
                        "status": status,
                        "error": error_msg,
                    }
                    yield f"data: {json.dumps(update)}\n\n"

                # Completion event.
                yield f"data: {json.dumps({'complete': True, 'total': total})}\n\n"
        finally:
            from ...utilities.resource_utils import safe_close

            safe_close(download_service, "download service")

    return Response(
        stream_with_context(generate()), mimetype="text/event-stream"
    )
@library_bp.route("/api/download-research/<research_id>", methods=["POST"])
@login_required
def download_research_pdfs(research_id):
    """Queue all PDFs from a research session for download.

    Request body (optional JSON object):
        collection_id: target collection passed to
            ``DownloadService.queue_research_downloads``; ``None`` when the
            body is missing, is not valid JSON, or is not a JSON object.

    Returns:
        JSON ``{"success": true, "queued": <result of the queue call>}``.
    """
    username = session["username"]
    user_password = get_authenticated_user_password(username)

    # collection_id is optional, so a request without a JSON body must still
    # be accepted; ``silent=True`` yields None instead of raising in that case.
    body = request.get_json(silent=True)
    collection_id = body.get("collection_id") if isinstance(body, dict) else None

    with DownloadService(username, user_password) as service:
        queued = service.queue_research_downloads(research_id, collection_id)

        # Start processing queue (in production, this would be a background task)
        # TODO: Integrate with existing queue processor

    return jsonify({"success": True, "queued": queued})
# Substrings (matched case-insensitively) that mark a download exception as an
# access restriction rather than a genuine failure.
_ACCESS_RESTRICTION_PHRASES = (
    "paywall",
    "subscription",
    "not available",
    "not found",
    "no free",
    "embargoed",
    "forbidden",
    "not accessible",
)

# Substrings (matched case-insensitively) that mark a download exception as a
# download-level failure.
_DOWNLOAD_FAILURE_PHRASES = (
    "failed to download",
    "could not",
    "invalid",
    "server",
)


def _classify_download_error(error_msg, error_type):
    """Map an exception message to the ``(status, reason)`` pair sent to the UI.

    Only the exception type name appears in the reason; the raw message is
    used for matching and is not included.
    """
    lowered = error_msg.lower()
    if any(phrase in lowered for phrase in _ACCESS_RESTRICTION_PHRASES):
        return (
            "skipped",
            f"Document not accessible (paywall or access restriction) - {error_type}",
        )
    if any(phrase in lowered for phrase in _DOWNLOAD_FAILURE_PHRASES):
        return "failed", f"Download failed - {error_type}"
    return "failed", f"Processing failed - {error_type}"


@library_bp.route("/api/download-bulk", methods=["POST"])
@login_required
@require_json_body()
def download_bulk():
    """Download PDFs or extract text from multiple research sessions.

    Request body (JSON object):
        research_ids: list of research IDs to process; an empty or missing
            list is answered with HTTP 400.
        mode: ``"pdf"`` (default) calls ``download_resource``; any other
            value calls ``download_as_text``.
        collection_id: optional target collection used when downloads have
            to be queued first.

    Returns:
        A ``text/event-stream`` response. After an initial event carrying the
        pending total, one event is sent per queue item with ``progress``,
        ``current``, ``total``, ``file``, ``status`` and, when a reason is
        known, ``error``. The stream ends with an event where ``complete`` is
        true.
    """
    username = session["username"]
    data = request.json
    research_ids = data.get("research_ids", [])
    mode = data.get("mode", "pdf")  # pdf or text_only
    collection_id = data.get("collection_id")  # optional target collection

    if not research_ids:
        return jsonify({"error": "No research IDs provided"}), 400

    # Read the Flask session ID up front and pass it into the generator as a
    # plain value, so the generator does not touch Flask's ``session`` itself.
    flask_session_id = session.get("session_id")

    # Log labels depend only on the mode, so compute them once.
    attempt_label = "PDF download" if mode == "pdf" else "text extraction"
    result_label = "Download" if mode == "pdf" else "Text extraction"

    def pending_query(db_session, research_id):
        """Query for the pending queue rows of one research session."""
        return db_session.query(LibraryDownloadQueue).filter_by(
            research_id=research_id,
            status=DocumentStatus.PENDING,
        )

    def generate():
        """Generate progress updates as Server-Sent Events."""
        # The password is needed for database operations.
        try:
            user_password = get_authenticated_user_password(
                username, flask_session_id
            )
        except AuthenticationRequiredError:
            logger.warning(
                f"Authentication unavailable for user {username} - password not in session store"
            )
            yield f"data: {json.dumps({'progress': 0, 'current': 0, 'total': 0, 'error': 'Authentication required', 'complete': True})}\n\n"
            return

        download_service = DownloadService(username, user_password)
        try:
            total = 0
            current = 0

            # Count the items that are already pending across all research IDs.
            # ``db_session`` is used so Flask's ``session`` is not shadowed.
            with get_user_db_session(username) as db_session:
                for research_id in research_ids:
                    count = pending_query(db_session, research_id).count()
                    total += count
                    logger.debug(
                        f"[PROGRESS_DEBUG] Research {research_id}: {count} pending items in queue"
                    )

            logger.info(
                f"[PROGRESS_DEBUG] Total pending downloads across all research: {total}"
            )
            yield f"data: {json.dumps({'progress': 0, 'current': 0, 'total': total})}\n\n"

            for research_id in research_ids:
                with get_user_db_session(username) as db_session:
                    queue_items = pending_query(db_session, research_id).all()

                    # Nothing queued yet for this research: queue now, re-read.
                    if not queue_items:
                        try:
                            download_service.queue_research_downloads(
                                research_id, collection_id
                            )
                            queue_items = pending_query(
                                db_session, research_id
                            ).all()
                            # These items were not part of the initial count
                            # (it was 0 for this research). Add them so that
                            # progress is not computed against a stale total.
                            total += len(queue_items)
                        except Exception:
                            logger.exception(
                                f"Error queueing downloads for research {research_id}"
                            )
                            queue_items = []

                    for queue_item in queue_items:
                        current += 1
                        # Clamped: the queue may have changed since counting.
                        progress = (
                            min(100, int((current / total) * 100))
                            if total > 0
                            else 100
                        )
                        logger.debug(
                            f"[PROGRESS_DEBUG] Calculated progress: {progress}%"
                        )

                        resource = db_session.query(ResearchResource).get(
                            queue_item.resource_id
                        )
                        # Fall back to a generic name when the resource row or
                        # its title is missing (``None[:50]`` would raise).
                        file_name = (
                            resource.title[:50]
                            if resource and resource.title
                            else f"document_{current}.pdf"
                        )

                        skip_reason = None
                        status = "skipped"  # default until proven otherwise

                        try:
                            logger.debug(
                                f"Attempting {attempt_label} for resource {queue_item.resource_id}"
                            )

                            if mode == "pdf":
                                result = download_service.download_resource(
                                    queue_item.resource_id
                                )
                            else:  # text_only
                                result = download_service.download_as_text(
                                    queue_item.resource_id
                                )

                            # Services return (success, reason); a bare
                            # success flag is still accepted.
                            if isinstance(result, tuple):
                                success, skip_reason = result
                            else:
                                success, skip_reason = result, None

                            status = "success" if success else "skipped"
                            if skip_reason and not success:
                                logger.info(
                                    f"{result_label} skipped for resource {queue_item.resource_id}: {skip_reason}"
                                )

                            logger.debug(
                                f"{result_label} result: success={success}, status={status}, skip_reason={skip_reason}"
                            )
                        except Exception as e:
                            # Roll back FIRST: the next iteration queries this
                            # same session before reaching its own try/except,
                            # so a poisoned session would cascade into
                            # PendingRollbackError there (issue #3827).
                            safe_rollback(db_session, "SSE download")
                            # Log the error but keep processing the queue.
                            error_msg = str(e)
                            error_type = type(e).__name__
                            logger.info(
                                f"CAUGHT Download exception for resource {queue_item.resource_id}: {error_type}: {error_msg}"
                            )
                            status, skip_reason = _classify_download_error(
                                error_msg, error_type
                            )

                        update_data = {
                            "progress": progress,
                            "current": current,
                            "total": total,
                            "file": file_name,
                            "status": status,
                        }
                        if skip_reason:
                            update_data["error"] = skip_reason
                            logger.info(
                                f"Sending skip reason to UI: {skip_reason}"
                            )

                        logger.info(f"Update data being sent: {update_data}")
                        yield f"data: {json.dumps(update_data)}\n\n"

            yield f"data: {json.dumps({'progress': 100, 'current': total, 'total': total, 'complete': True})}\n\n"
        finally:
            from ...utilities.resource_utils import safe_close

            safe_close(download_service, "download service")

    return Response(
        stream_with_context(generate()),
        mimetype="text/event-stream",
        headers={"Cache-Control": "no-cache", "X-Accel-Buffering": "no"},
    )
@library_bp.route("/api/research-list")
@login_required
def get_research_list():
    """Return the research sessions used to populate dropdowns."""
    library = LibraryService(session["username"])
    return jsonify({"research": library.get_research_list_for_dropdown()})
@library_bp.route("/api/sync-library", methods=["POST"])
@login_required
def sync_library():
    """Sync the library database with the filesystem and return the stats."""
    library = LibraryService(session["username"])
    sync_stats = library.sync_library_with_filesystem()
    return jsonify(sync_stats)
@library_bp.route("/api/mark-redownload", methods=["POST"])
@login_required
@require_json_body()
def mark_for_redownload():
    """Flag the given documents for re-download; 400 when no IDs are supplied."""
    username = session["username"]
    library = LibraryService(username)

    document_ids = request.json.get("document_ids", [])

    if document_ids:
        marked = library.mark_for_redownload(document_ids)
        return jsonify({"success": True, "marked": marked})

    return jsonify({"error": "No document IDs provided"}), 400
@library_bp.route("/api/queue-all-undownloaded", methods=["POST"])
@login_required
def queue_all_undownloaded():
    """Queue every research resource that has no completed download.

    Steps:
      1. Select resources with no matching ``Document`` in status
         ``"completed"``.
      2. Run them through ``ResourceFilter`` and skip those without a
         filter result or whose result has ``can_retry`` falsy.
      3. Skip resources without a URL or whose URL fails
         ``is_downloadable_domain``.
      4. Create a pending queue entry, or reset an existing entry back to
         pending; entries that are already pending are counted as queued.

    The downloads themselves are not started here.

    Returns:
        JSON response with ``success``, ``queued``, ``research_ids``,
        ``total_undownloaded``, ``skipped``, ``filter_summary`` and
        ``skipped_details``.
    """
    username = session["username"]

    logger.info(f"queue_all_undownloaded called for user {username}")

    with get_user_db_session(username) as db_session:
        # Outer join against *completed* documents only, then keep the rows
        # where nothing matched: those resources still need a download.
        linked_to_resource = (ResearchResource.id == Document.resource_id) | (
            ResearchResource.document_id == Document.id
        )
        undownloaded = (
            db_session.query(ResearchResource)
            .outerjoin(
                Document,
                linked_to_resource & (Document.status == "completed"),
            )
            .filter(Document.id.is_(None))
            .all()
        )

        logger.info(f"Found {len(undownloaded)} total undownloaded resources")

        # The password is required to open the user's encrypted database.
        user_password = get_authenticated_user_password(username)

        resource_filter = ResourceFilter(username, user_password)
        filter_results = resource_filter.filter_downloadable_resources(
            undownloaded
        )

        # Detailed filtering report, returned to the client as-is.
        filter_summary = resource_filter.get_filter_summary(undownloaded)
        skipped_info = resource_filter.get_skipped_resources_info(undownloaded)

        logger.info(f"Filter results: {filter_summary.to_dict()}")

        queued_count = 0
        research_ids = set()
        skipped_count = 0

        # Index the filter results by resource id for constant-time lookup.
        results_by_resource_id = {r.resource_id: r for r in filter_results}

        for resource in undownloaded:
            filter_result = results_by_resource_id.get(resource.id)

            # Gate 1: the retry policy must allow another attempt.
            if not (filter_result and filter_result.can_retry):
                skipped_count += 1
                if filter_result:
                    logger.debug(
                        f"Skipping resource {resource.id} due to retry policy: {filter_result.reason}"
                    )
                else:
                    logger.debug(
                        f"Skipping resource {resource.id} - no filter result available"
                    )
                continue

            # Gate 2: a URL must be present.
            if not resource.url:
                skipped_count += 1
                continue

            is_downloadable = is_downloadable_domain(resource.url)

            # Log what we're checking
            if "pubmed" in resource.url.lower():
                logger.info(f"Found PubMed URL: {resource.url[:100]}")

            # Gate 3: the URL must be downloadable.
            if not is_downloadable:
                skipped_count += 1
                logger.debug(
                    f"Skipping non-downloadable URL: {resource.url[:100] if resource.url else 'None'}"
                )
                continue

            # Look for an existing queue entry in any status.
            queue_row = (
                db_session.query(LibraryDownloadQueue)
                .filter_by(resource_id=resource.id)
                .first()
            )

            if not queue_row:
                db_session.add(
                    LibraryDownloadQueue(
                        resource_id=resource.id,
                        research_id=resource.research_id,
                        priority=0,
                        status=DocumentStatus.PENDING,
                    )
                )
                logger.debug(
                    f"Added new queue entry for resource {resource.id}"
                )
            elif queue_row.status != DocumentStatus.PENDING:
                # Entry exists in another state: put it back in line.
                queue_row.status = DocumentStatus.PENDING
                queue_row.completed_at = None
                logger.debug(
                    f"Reset queue entry for resource {resource.id} to pending"
                )
            else:
                logger.debug(
                    f"Resource {resource.id} already pending in queue"
                )

            # New, reset and already-pending entries all count as queued.
            queued_count += 1
            research_ids.add(resource.research_id)

        db_session.commit()

        logger.info(
            f"Queued {queued_count} articles for download, skipped {skipped_count} resources (including {filter_summary.permanently_failed_count} permanently failed and {filter_summary.temporarily_failed_count} temporarily failed)"
        )

        # Note: downloads are deliberately not processed synchronously here,
        # to avoid blocking the HTTP request. They are processed via the SSE
        # streaming endpoint or background tasks.

        return jsonify(
            {
                "success": True,
                "queued": queued_count,
                "research_ids": list(research_ids),
                "total_undownloaded": len(undownloaded),
                "skipped": skipped_count,
                "filter_summary": filter_summary.to_dict(),
                "skipped_details": skipped_info,
            }
        )
@library_bp.route("/api/get-research-sources/<research_id>", methods=["GET"])
@login_required
def get_research_sources(research_id):
    """List every source of a research session together with its snippet.

    Args:
        research_id: Identifier of the research whose resources are listed.

    Returns:
        JSON response ``{"success": True, "sources": [...], "total": n}``.
        Sources are numbered from 1 in ``created_at`` order; entries whose
        document has status ``"completed"`` also carry the document id,
        file type and download date.
    """
    from urllib.parse import urlparse

    username = session["username"]
    sources = []

    with get_user_db_session(username) as db_session:
        resources = (
            db_session.query(ResearchResource)
            .filter_by(research_id=research_id)
            .order_by(ResearchResource.created_at)
            .all()
        )

        for idx, resource in enumerate(resources, 1):
            # Document linked to this resource, if there is one.
            document = get_document_for_resource(db_session, resource)

            # Host name of the URL; stays empty when it cannot be derived.
            domain = ""
            if resource.url:
                try:
                    domain = urlparse(resource.url).hostname or ""
                except (ValueError, AttributeError):
                    # urlparse can raise ValueError for malformed URLs
                    pass

            # Start from the "not downloaded" shape.
            source_data = {
                "number": idx,
                "resource_id": resource.id,
                "url": resource.url,
                "title": resource.title or f"Source {idx}",
                "snippet": resource.content_preview or "",
                "domain": domain,
                "relevance_score": getattr(resource, "relevance_score", None),
                "downloaded": False,
                "document_id": None,
                "file_type": None,
            }

            if document and document.status == "completed":
                source_data["downloaded"] = True
                source_data["document_id"] = document.id
                source_data["file_type"] = document.file_type
                source_data["download_date"] = (
                    document.created_at.isoformat()
                    if document.created_at
                    else None
                )

            sources.append(source_data)

    return jsonify({"success": True, "sources": sources, "total": len(sources)})
@library_bp.route("/api/check-downloads", methods=["POST"])
@login_required
@require_json_body()
def check_downloads():
    """Report the download status of a list of URLs within one research.

    Reads ``research_id`` and ``urls`` from the JSON request body.

    Returns:
        ``{"download_status": {url: {...}}}`` with one entry per matching
        resource, or a 400 response with an ``error`` message when either
        field is missing or empty.
    """
    username = session["username"]
    payload = request.json
    research_id = payload.get("research_id")
    urls = payload.get("urls", [])

    if not (research_id and urls):
        return jsonify({"error": "Missing research_id or urls"}), 400

    download_status = {}

    with get_user_db_session(username) as db_session:
        # Only resources of this research whose URL was asked about.
        matching_resources = (
            db_session.query(ResearchResource)
            .filter_by(research_id=research_id)
            .filter(ResearchResource.url.in_(urls))
            .all()
        )

        for resource in matching_resources:
            document = get_document_for_resource(db_session, resource)

            if document and document.status == "completed":
                status_entry = {
                    "downloaded": True,
                    "document_id": document.id,
                    "file_path": document.file_path,
                    "file_type": document.file_type,
                    "title": document.title or resource.title,
                }
            else:
                status_entry = {
                    "downloaded": False,
                    "resource_id": resource.id,
                }

            download_status[resource.url] = status_entry

    return jsonify({"download_status": download_status})
@library_bp.route("/api/download-source", methods=["POST"])
@login_required
@require_json_body()
def download_source():
    """Download a single source from a research.

    Reads ``research_id`` and ``url`` from the JSON request body, makes
    sure a pending, priority-1 queue entry exists for the matching
    resource and then runs the download synchronously.

    Returns:
        - 400 with an ``error`` message when a field is missing or the URL
          is rejected by ``is_downloadable_domain``.
        - 404 with an ``error`` message when no resource matches.
        - ``{"success": True, ...}`` when the document was already
          downloaded (includes ``document_id``) or the download succeeded.
        - ``{"success": False, "message": "Download failed"}`` when the
          download failed; the detailed reason is only written to the log.
    """
    username = session["username"]
    user_password = get_authenticated_user_password(username)
    data = request.json
    research_id = data.get("research_id")
    url = data.get("url")

    if not research_id or not url:
        return jsonify({"error": "Missing research_id or url"}), 400

    # Check if URL is downloadable
    if not is_downloadable_domain(url):
        return jsonify({"error": "URL is not from a downloadable domain"}), 400

    with get_user_db_session(username) as db_session:
        # Find the resource
        resource = (
            db_session.query(ResearchResource)
            .filter_by(research_id=research_id, url=url)
            .first()
        )

        if not resource:
            return jsonify({"error": "Resource not found"}), 404

        # Check if already downloaded
        existing = get_document_for_resource(db_session, resource)

        if existing and existing.status == "completed":
            return jsonify(
                {
                    "success": True,
                    "message": "Already downloaded",
                    "document_id": existing.id,
                }
            )

        # Add to download queue (or re-arm the existing entry)
        queue_entry = (
            db_session.query(LibraryDownloadQueue)
            .filter_by(resource_id=resource.id)
            .first()
        )

        if not queue_entry:
            queue_entry = LibraryDownloadQueue(
                resource_id=resource.id,
                research_id=resource.research_id,
                priority=1,  # Higher priority for manual downloads
                status=DocumentStatus.PENDING,
            )
            db_session.add(queue_entry)
        else:
            queue_entry.status = DocumentStatus.PENDING
            queue_entry.priority = 1

        db_session.commit()

        # Start download immediately
        with DownloadService(username, user_password) as service:
            success, message = service.download_resource(resource.id)

            if success:
                return jsonify(
                    {"success": True, "message": "Download completed"}
                )

            # Keep the detailed failure reason server-side; the client only
            # receives a generic message so internal details are not leaked.
            logger.warning(
                f"Download failed for resource {resource.id}: {message}"
            )
            return jsonify({"success": False, "message": "Download failed"})