Coverage for src / local_deep_research / research_library / routes / library_routes.py: 94%
549 statements
« prev ^ index » next coverage.py v7.13.5, created at 2026-04-14 23:55 +0000
1"""
2Routes for Research Library and Download Manager
4Provides web endpoints for:
5- Library browsing and management
6- Download manager interface
7- API endpoints for downloads and queries
8"""
10import json
11import math
12from io import BytesIO
13from pathlib import Path
14from flask import (
15 Blueprint,
16 g,
17 jsonify,
18 request,
19 session,
20 Response,
21 send_file,
22 stream_with_context,
23)
24from loguru import logger
26from ...security.decorators import require_json_body
27from ...web.auth.decorators import login_required
28from ...web.utils.templates import render_template_with_defaults
29from ...database.session_context import get_user_db_session
30from ...database.models.research import ResearchResource
31from ...database.models.library import (
32 Document as Document,
33 DocumentStatus,
34 DownloadQueue as LibraryDownloadQueue,
35 Collection,
36)
37from ...library.download_management import ResourceFilter
38from ..services.download_service import DownloadService
39from ..services.library_service import LibraryService
40from ..services.pdf_storage_manager import PDFStorageManager
41from ..utils import (
42 get_document_for_resource,
43 handle_api_error,
44 is_downloadable_domain,
45 is_downloadable_url,
46)
47from ...utilities.db_utils import get_settings_manager
48from ...config.paths import get_library_directory
49from ...web.exceptions import AuthenticationRequiredError
# Blueprint for all /library pages and API endpoints.
library_bp = Blueprint("library", __name__, url_prefix="/library")

# NOTE: Handlers deliberately read session["username"] instead of
# session.get("username"): @login_required guarantees the key exists,
# and a hard KeyError is the desired failure mode should that decorator
# ever be removed.
@library_bp.errorhandler(Exception)
def handle_web_api_exception(error):
    """Turn WebAPIException subclasses into JSON responses; re-raise the rest."""
    from ...web.exceptions import WebAPIException

    if not isinstance(error, WebAPIException):
        # Not one of our API exceptions — let Flask's default handling run.
        raise error
    return jsonify(error.to_dict()), error.status_code
def get_authenticated_user_password(
    username: str, flask_session_id: str | None = None
) -> str:
    """
    Resolve the authenticated user's password.

    Checks the session password store first, then falls back to
    ``g.user_password`` (populated by middleware when temp auth was used).

    Args:
        username: Account whose password is needed.
        flask_session_id: Optional Flask session ID; defaults to
            ``session.get("session_id")`` when omitted.

    Returns:
        str: The user's password.

    Raises:
        AuthenticationRequiredError: If no password can be found anywhere.
    """
    from ...database.session_passwords import session_password_store

    sid = flask_session_id or session.get("session_id")

    # Primary source: the per-session password store.
    stored = None
    try:
        stored = session_password_store.get_session_password(username, sid)
    except Exception:
        logger.exception("Failed to get user password from session store")
    if stored:
        logger.debug(
            f"Retrieved user password from session store for user {username}"
        )
        return stored

    # Secondary source: g.user_password, set by middleware for temp_auth.
    fallback = getattr(g, "user_password", None)
    if fallback:
        logger.debug(
            f"Retrieved user password from g.user_password fallback for user {username}"
        )
        return fallback

    # Nothing available — the caller must re-authenticate.
    logger.error(f"No user password available for user {username}")
    raise AuthenticationRequiredError(
        message="Authentication required: Please refresh the page and log in again to access encrypted database features.",
    )
119# ============= Page Routes =============
@library_bp.route("/")
@login_required
def library_page():
    """Main library page showing downloaded documents.

    Supports filtering by domain, research session, collection and date,
    with page-based pagination (100 documents per page).
    """
    username = session["username"]
    service = LibraryService(username)

    # Library settings. get_settings_manager is already imported at module
    # level; the previous function-local re-import was redundant.
    settings = get_settings_manager()
    pdf_storage_mode = settings.get_setting(
        "research_library.pdf_storage_mode", "database"
    )
    # Enable PDF storage button if mode is not "none"
    enable_pdf_storage = pdf_storage_mode != "none"
    shared_library = settings.get_setting(
        "research_library.shared_library", False
    )

    # Get statistics
    stats = service.get_library_stats()

    # Optional filters from the query string
    domain_filter = request.args.get("domain")
    research_filter = request.args.get("research")
    collection_filter = request.args.get("collection")
    date_filter = request.args.get("date")

    # Resolve collection_id once to avoid redundant DB lookups
    from ...database.library_init import get_default_library_id

    resolved_collection = collection_filter or get_default_library_id(username)

    # Pagination: clamp the requested page into [1, total_pages].
    per_page = 100
    total_docs = service.count_documents(
        research_id=research_filter,
        domain=domain_filter,
        collection_id=resolved_collection,
        date_filter=date_filter,
    )
    total_pages = max(1, math.ceil(total_docs / per_page))
    page = request.args.get("page", 1, type=int)
    page = max(1, min(page, total_pages))
    offset = (page - 1) * per_page

    documents = service.get_documents(
        research_id=research_filter,
        domain=domain_filter,
        collection_id=resolved_collection,
        date_filter=date_filter,
        limit=per_page,
        offset=offset,
    )

    # Data for the filter dropdowns
    unique_domains = service.get_unique_domains()
    research_list = service.get_research_list_for_dropdown()
    collections = service.get_all_collections()

    # Default library collection ID, used by semantic search.
    default_collection_id = next(
        (c["id"] for c in collections if c.get("is_default")), None
    )

    return render_template_with_defaults(
        "pages/library.html",
        stats=stats,
        documents=documents,
        unique_domains=unique_domains,
        research_list=research_list,
        collections=collections,
        selected_collection=collection_filter,
        default_collection_id=default_collection_id,
        storage_path=stats.get("storage_path", ""),
        enable_pdf_storage=enable_pdf_storage,
        pdf_storage_mode=pdf_storage_mode,
        shared_library=shared_library,
        page=page,
        total_pages=total_pages,
        selected_date=date_filter,
        selected_research=research_filter,
        selected_domain=domain_filter,
    )
@library_bp.route("/document/<string:document_id>")
@login_required
def document_details_page(document_id):
    """Render the details page (metadata and links) for one document."""
    service = LibraryService(session["username"])

    document = service.get_document_by_id(document_id)
    if document is None:
        return "Document not found", 404

    return render_template_with_defaults(
        "pages/document_details.html", document=document
    )
@library_bp.route("/download-manager")
@login_required
def download_manager_page():
    """Download manager page for selecting and downloading research PDFs.

    Shows paginated research sessions (50 per page) with per-session PDF
    previews and domain breakdowns fetched in a single batch query.
    """
    username = session["username"]
    service = LibraryService(username)

    # Library settings. get_settings_manager is already imported at module
    # level; the previous function-local re-import was redundant.
    settings = get_settings_manager()
    pdf_storage_mode = settings.get_setting(
        "research_library.pdf_storage_mode", "database"
    )
    # Enable PDF storage button if mode is not "none"
    enable_pdf_storage = pdf_storage_mode != "none"
    shared_library = settings.get_setting(
        "research_library.shared_library", False
    )

    # Summary stats over ALL sessions (also used for page count)
    per_page = 50
    summary = service.get_download_manager_summary_stats()
    total_pages = max(1, math.ceil(summary["total_researches"] / per_page))

    # Pagination with upper-bound clamp
    page = request.args.get("page", 1, type=int)
    page = max(1, min(page, total_pages))
    offset = (page - 1) * per_page

    # Get paginated research sessions
    research_list = service.get_research_list_with_stats(
        limit=per_page, offset=offset
    )

    # Batch-fetch PDF previews and domain breakdowns (single query)
    research_ids = [r["id"] for r in research_list]
    previews = service.get_pdf_previews_batch(research_ids)
    for research in research_list:
        data = previews.get(
            research["id"], {"pdf_sources": [], "domains": {}}
        )
        research["pdf_sources"] = data["pdf_sources"]
        research["domains"] = data["domains"]

    return render_template_with_defaults(
        "pages/download_manager.html",
        research_list=research_list,
        total_researches=summary["total_researches"],
        total_resources=summary["total_resources"],
        already_downloaded=summary["already_downloaded"],
        available_to_download=summary["available_to_download"],
        enable_pdf_storage=enable_pdf_storage,
        pdf_storage_mode=pdf_storage_mode,
        shared_library=shared_library,
        page=page,
        total_pages=total_pages,
    )
290# ============= API Routes =============
@library_bp.route("/api/stats")
@login_required
def get_library_stats():
    """Return library statistics as JSON."""
    service = LibraryService(session["username"])
    return jsonify(service.get_library_stats())
@library_bp.route("/api/collections/list")
@login_required
def get_collections_list():
    """Return every collection (id/name/description) for dropdown selection."""
    username = session["username"]

    with get_user_db_session(username) as db_session:
        rows = db_session.query(Collection).order_by(Collection.name).all()
        # Serialize while the session is open so ORM attributes are loaded.
        collections = [
            {
                "id": row.id,
                "name": row.name,
                "description": row.description,
            }
            for row in rows
        ]

    return jsonify({"success": True, "collections": collections})
@library_bp.route("/api/documents")
@login_required
def get_documents():
    """Get documents with filtering.

    Query parameters: research_id, domain, file_type, favorites ("true"),
    search, limit (default 100) and offset (default 0).
    """
    username = session["username"]
    service = LibraryService(username)

    # Filter parameters from the query string.
    research_id = request.args.get("research_id")
    domain = request.args.get("domain")
    file_type = request.args.get("file_type")
    favorites_only = request.args.get("favorites") == "true"
    search_query = request.args.get("search")
    # type=int makes a malformed value fall back to the default instead of
    # int(...) raising ValueError and surfacing as a 500.
    limit = request.args.get("limit", 100, type=int)
    offset = request.args.get("offset", 0, type=int)

    documents = service.get_documents(
        research_id=research_id,
        domain=domain,
        file_type=file_type,
        favorites_only=favorites_only,
        search_query=search_query,
        limit=limit,
        offset=offset,
    )

    return jsonify({"documents": documents})
@library_bp.route(
    "/api/document/<string:document_id>/favorite", methods=["POST"]
)
@login_required
def toggle_favorite(document_id):
    """Flip a document's favorite flag and return the new state."""
    service = LibraryService(session["username"])
    return jsonify({"favorite": service.toggle_favorite(document_id)})
@library_bp.route("/api/document/<string:document_id>", methods=["DELETE"])
@login_required
def delete_document(document_id):
    """Remove a document from the library and report whether it worked."""
    service = LibraryService(session["username"])
    return jsonify({"success": service.delete_document(document_id)})
@library_bp.route("/api/document/<string:document_id>/pdf-url")
@login_required
def get_pdf_url(document_id):
    """Return the URL at which this document's PDF can be viewed."""
    # The PDF bytes themselves are streamed by /api/document/<id>/pdf.
    payload = {
        "url": f"/library/api/document/{document_id}/pdf",
        "title": "Document",  # Could fetch actual title
    }
    return jsonify(payload)
@library_bp.route("/document/<string:document_id>/pdf")
@login_required
def view_pdf_page(document_id):
    """Serve a document's PDF, resolving it via PDFStorageManager."""
    username = session["username"]

    with get_user_db_session(username) as db_session:
        # Everything below depends on the document row existing.
        document = (
            db_session.query(Document).filter_by(id=document_id).first()
        )
        if not document:
            logger.warning(
                f"Document ID {document_id} not found in database for user {username}"
            )
            return "Document not found", 404

        logger.info(
            f"Document {document_id}: title='{document.title}', "
            f"file_path={document.file_path}"
        )

        # Storage configuration decides where PDF bytes live
        # (database blob vs. files under the library root).
        settings = get_settings_manager(db_session)
        storage_mode = settings.get_setting(
            "research_library.pdf_storage_mode", "none"
        )
        configured_root = settings.get_setting(
            "research_library.storage_path",
            str(get_library_directory()),
        )
        library_root = Path(configured_root).expanduser().resolve()

        # PDFStorageManager abstracts over both storage back ends.
        pdf_manager = PDFStorageManager(library_root, storage_mode)
        pdf_bytes = pdf_manager.load_pdf(document, db_session)

        if not pdf_bytes:
            # Neither the database nor the filesystem had a copy.
            logger.warning(f"No PDF available for document {document_id}")
            return "PDF not available", 404

        logger.info(
            f"Serving PDF for document {document_id} ({len(pdf_bytes)} bytes)"
        )
        return send_file(
            BytesIO(pdf_bytes),
            mimetype="application/pdf",
            as_attachment=False,
            download_name=document.filename or "document.pdf",
        )
@library_bp.route("/api/document/<string:document_id>/pdf")
@login_required
def serve_pdf_api(document_id):
    """Legacy API alias for PDF serving; delegates to view_pdf_page."""
    return view_pdf_page(document_id=document_id)
@library_bp.route("/document/<string:document_id>/txt")
@login_required
def view_text_page(document_id):
    """Render a document's extracted text as an HTML page."""
    username = session["username"]

    with get_user_db_session(username) as db_session:
        # Extracted text lives directly on the Document row.
        document = (
            db_session.query(Document).filter_by(id=document_id).first()
        )
        if not document:
            logger.warning(f"Document not found for document ID {document_id}")
            return "Document not found", 404
        if not document.text_content:
            logger.warning(f"Document {document_id} has no text content")
            return "Text content not available", 404

        logger.info(
            f"Serving text content for document {document_id}: {len(document.text_content)} characters"
        )

        return render_template_with_defaults(
            "pages/document_text.html",
            document_id=document_id,
            title=document.title or "Document Text",
            text_content=document.text_content,
            extraction_method=document.extraction_method,
            word_count=document.word_count,
        )
@library_bp.route("/api/document/<string:document_id>/text")
@login_required
def serve_text_api(document_id):
    """Return a document's extracted text as JSON (legacy API endpoint)."""
    username = session["username"]

    with get_user_db_session(username) as db_session:
        # Extracted text lives directly on the Document row.
        document = (
            db_session.query(Document).filter_by(id=document_id).first()
        )
        if not document:
            logger.warning(f"Document not found for document ID {document_id}")
            return jsonify({"error": "Document not found"}), 404
        if not document.text_content:
            logger.warning(f"Document {document_id} has no text content")
            return jsonify({"error": "Text content not available"}), 404

        logger.info(
            f"Serving text content for document {document_id}: {len(document.text_content)} characters"
        )

        payload = {
            "text_content": document.text_content,
            "title": document.title or "Document",
            "extraction_method": document.extraction_method,
            "word_count": document.word_count,
        }
        return jsonify(payload)
@library_bp.route("/api/open-folder", methods=["POST"])
@login_required
def open_folder():
    """Open folder containing a document.

    Security: This endpoint is disabled for server deployments.
    It only makes sense for desktop usage where the server and client are on the same machine.
    """
    payload = {
        "status": "error",
        "message": "This feature is disabled. It is only available in desktop mode.",
    }
    return jsonify(payload), 403
@library_bp.route("/api/download/<int:resource_id>", methods=["POST"])
@login_required
def download_single_resource(resource_id):
    """Download one resource's PDF; respond with success or a generic error."""
    username = session["username"]
    user_password = get_authenticated_user_password(username)

    with DownloadService(username, user_password) as service:
        success, error = service.download_resource(resource_id)
        if success:
            return jsonify({"success": True})
        # Detail goes to the log; the client gets a generic message.
        logger.warning(f"Download failed for resource {resource_id}: {error}")
        return jsonify(
            {
                "success": False,
                "error": "Download failed. Please try again or contact support.",
            }
        ), 500
@library_bp.route("/api/download-text/<int:resource_id>", methods=["POST"])
@login_required
def download_text_single(resource_id):
    """Extract a single resource to a text file and report the outcome."""
    try:
        username = session["username"]
        user_password = get_authenticated_user_password(username)

        with DownloadService(username, user_password) as service:
            success, error = service.download_as_text(resource_id)

            if success:
                return jsonify({"success": True, "error": None})

            # Sanitize error message - don't expose internal details
            if error:
                logger.warning(
                    f"Download as text failed for resource {resource_id}: {error}"
                )
            return jsonify(
                {"success": False, "error": "Failed to download resource"}
            )
    except AuthenticationRequiredError:
        raise  # Let blueprint error handler return 401
    except Exception as e:
        return handle_api_error(
            f"downloading resource {resource_id} as text", e
        )
@library_bp.route("/api/download-all-text", methods=["POST"])
@login_required
def download_all_text():
    """Stream text extraction of every undownloaded resource as SSE events."""
    username = session["username"]
    # Capture the Flask session ID now; the generator runs outside the
    # request scope where `session` access would be awkward.
    flask_session_id = session.get("session_id")

    def generate():
        # Resolve the password up front; emit one terminal SSE event and
        # stop if authentication is unavailable.
        try:
            user_password = get_authenticated_user_password(
                username, flask_session_id
            )
        except AuthenticationRequiredError:
            logger.warning(
                f"Authentication unavailable for user {username} - password not in session store"
            )
            yield f"data: {json.dumps({'progress': 0, 'current': 0, 'total': 0, 'error': 'Authentication required', 'complete': True})}\n\n"
            return

        download_service = DownloadService(username, user_password)
        try:
            with get_user_db_session(username) as db_session:
                # Only academic/PDF-style URLs are worth attempting.
                candidates = [
                    r
                    for r in db_session.query(ResearchResource).all()
                    if is_downloadable_url(r.url)
                ]

                # Pre-scan the txt directory once so the per-resource
                # check is a set lookup, not a filesystem hit.
                txt_dir = Path(download_service.library_root) / "txt"
                already_extracted = set()
                if txt_dir.exists():
                    for txt_file in txt_dir.glob("*.txt"):
                        # Filenames follow the pattern *_{resource_id}.txt
                        parts = txt_file.stem.rsplit("_", 1)
                        if len(parts) == 2:
                            try:
                                already_extracted.add(int(parts[1]))
                            except ValueError:
                                pass

                pending = [
                    r for r in candidates if r.id not in already_extracted
                ]

                total = len(pending)
                logger.info(f"Found {total} resources needing text extraction")

                for index, resource in enumerate(pending, start=1):
                    progress = (
                        int((index / total) * 100) if total > 0 else 100
                    )
                    file_name = (
                        resource.title[:50]
                        if resource
                        else f"document_{index}.txt"
                    )

                    try:
                        ok, err = download_service.download_as_text(
                            resource.id
                        )
                        if ok:
                            status = "success"
                            error_msg = None
                        else:
                            status = "failed"
                            error_msg = err or "Text extraction failed"
                    except Exception as e:
                        logger.exception(
                            f"Error extracting text for resource {resource.id}"
                        )
                        status = "failed"
                        error_msg = (
                            f"Text extraction failed - {type(e).__name__}"
                        )

                    # Per-resource progress event for the UI.
                    update = {
                        "progress": progress,
                        "current": index,
                        "total": total,
                        "file": file_name,
                        "url": resource.url,
                        "status": status,
                        "error": error_msg,
                    }
                    yield f"data: {json.dumps(update)}\n\n"

                # Terminal event.
                yield f"data: {json.dumps({'complete': True, 'total': total})}\n\n"
        finally:
            from ...utilities.resource_utils import safe_close

            safe_close(download_service, "download service")

    return Response(
        stream_with_context(generate()), mimetype="text/event-stream"
    )
@library_bp.route("/api/download-research/<research_id>", methods=["POST"])
@login_required
def download_research_pdfs(research_id):
    """Queue every PDF from one research session for download."""
    username = session["username"]
    user_password = get_authenticated_user_password(username)

    with DownloadService(username, user_password) as service:
        # The optional JSON body may carry a target collection.
        payload = request.json or {}
        queued = service.queue_research_downloads(
            research_id, payload.get("collection_id")
        )

    # Queue processing currently runs synchronously.
    # TODO: Integrate with existing queue processor
    return jsonify({"success": True, "queued": queued})
@library_bp.route("/api/download-bulk", methods=["POST"])
@login_required
@require_json_body()
def download_bulk():
    """Download PDFs or extract text from multiple research sessions.

    Expects a JSON body with:
        research_ids: list of research session IDs (required)
        mode: "pdf" (default) or "text_only"
        collection_id: optional target collection for downloads

    Streams progress as Server-Sent Events. Each event carries progress,
    current/total counters, the file name, a status of
    "success"/"skipped"/"failed", and an "error" reason when relevant.
    """
    username = session["username"]
    data = request.json
    research_ids = data.get("research_ids", [])
    mode = data.get("mode", "pdf")  # pdf or text_only
    collection_id = data.get("collection_id")

    if not research_ids:
        return jsonify({"error": "No research IDs provided"}), 400

    # Capture Flask session ID to avoid scoping issues in nested function
    flask_session_id = session.get("session_id")

    def generate():
        """Generate progress updates as Server-Sent Events."""
        # Get user password for database operations
        try:
            user_password = get_authenticated_user_password(
                username, flask_session_id
            )
        except AuthenticationRequiredError:
            logger.warning(
                f"Authentication unavailable for user {username} - password not in session store"
            )
            yield f"data: {json.dumps({'progress': 0, 'current': 0, 'total': 0, 'error': 'Authentication required', 'complete': True})}\n\n"
            return

        download_service = DownloadService(username, user_password)
        try:
            # Count total pending queue items across all research IDs so the
            # progress percentage is meaningful from the first event.
            total = 0
            current = 0

            with get_user_db_session(username) as db_session:
                for research_id in research_ids:
                    total += (
                        db_session.query(LibraryDownloadQueue)
                        .filter_by(
                            research_id=research_id,
                            status=DocumentStatus.PENDING,
                        )
                        .count()
                    )

            logger.info(
                f"[PROGRESS_DEBUG] Total pending downloads across all research: {total}"
            )
            yield f"data: {json.dumps({'progress': 0, 'current': 0, 'total': total})}\n\n"

            # Process each research
            for research_id in research_ids:
                with get_user_db_session(username) as db_session:
                    # Pending queue items for this research.
                    queue_items = (
                        db_session.query(LibraryDownloadQueue)
                        .filter_by(
                            research_id=research_id,
                            status=DocumentStatus.PENDING,
                        )
                        .all()
                    )

                    # If no items queued yet, queue them now
                    if not queue_items:
                        try:
                            download_service.queue_research_downloads(
                                research_id, collection_id
                            )
                            # Re-fetch queue items. Uses DocumentStatus.PENDING
                            # for consistency with every other queue query
                            # (this lookup previously used the raw string
                            # "pending").
                            queue_items = (
                                db_session.query(LibraryDownloadQueue)
                                .filter_by(
                                    research_id=research_id,
                                    status=DocumentStatus.PENDING,
                                )
                                .all()
                            )
                        except Exception:
                            logger.exception(
                                f"Error queueing downloads for research {research_id}"
                            )
                            # Continue with empty queue_items
                            queue_items = []

                    # Process each queued item
                    for queue_item in queue_items:
                        current += 1
                        progress = (
                            int((current / total) * 100) if total > 0 else 100
                        )

                        # Resource info for the UI label.
                        resource = db_session.query(ResearchResource).get(
                            queue_item.resource_id
                        )
                        file_name = (
                            resource.title[:50]
                            if resource
                            else f"document_{current}.pdf"
                        )

                        # Attempt actual download with error handling
                        skip_reason = None
                        status = "skipped"  # Default to skipped
                        success = False
                        error_msg = None

                        try:
                            # Call appropriate service method based on mode
                            if mode == "pdf":
                                result = download_service.download_resource(
                                    queue_item.resource_id
                                )
                            else:  # text_only
                                result = download_service.download_as_text(
                                    queue_item.resource_id
                                )

                            # Handle new tuple return format
                            if isinstance(result, tuple):
                                success, skip_reason = result
                            else:
                                success = result
                                skip_reason = None

                            status = "success" if success else "skipped"
                            if skip_reason and not success:
                                error_msg = skip_reason
                                logger.info(
                                    f"{'Download' if mode == 'pdf' else 'Text extraction'} skipped for resource {queue_item.resource_id}: {skip_reason}"
                                )
                        except Exception as e:
                            # Log error but continue processing
                            error_msg = str(e)
                            error_type = type(e).__name__
                            logger.info(
                                f"CAUGHT Download exception for resource {queue_item.resource_id}: {error_type}: {error_msg}"
                            )
                            # Classify the failure so the UI can show a
                            # readable category instead of raw exception text.
                            lowered = error_msg.lower()
                            if any(
                                phrase in lowered
                                for phrase in [
                                    "paywall",
                                    "subscription",
                                    "not available",
                                    "not found",
                                    "no free",
                                    "embargoed",
                                    "forbidden",
                                    "not accessible",
                                ]
                            ):
                                status = "skipped"
                                skip_reason = f"Document not accessible (paywall or access restriction) - {error_type}"
                            elif any(
                                phrase in lowered
                                for phrase in [
                                    "failed to download",
                                    "could not",
                                    "invalid",
                                    "server",
                                ]
                            ):
                                status = "failed"
                                skip_reason = f"Download failed - {error_type}"
                            else:
                                status = "failed"
                                skip_reason = (
                                    f"Processing failed - {error_type}"
                                )
                            success = False

                        # Ensure skip_reason is set if we have an error message
                        if error_msg and not skip_reason:
                            skip_reason = f"Processing failed - {error_type}"

                        # Send progress update
                        update_data = {
                            "progress": progress,
                            "current": current,
                            "total": total,
                            "file": file_name,
                            "status": status,
                        }
                        # Add skip reason if available
                        if skip_reason:
                            update_data["error"] = skip_reason

                        yield f"data: {json.dumps(update_data)}\n\n"

            yield f"data: {json.dumps({'progress': 100, 'current': total, 'total': total, 'complete': True})}\n\n"
        finally:
            from ...utilities.resource_utils import safe_close

            safe_close(download_service, "download service")

    return Response(
        stream_with_context(generate()),
        mimetype="text/event-stream",
        headers={"Cache-Control": "no-cache", "X-Accel-Buffering": "no"},
    )
@library_bp.route("/api/research-list")
@login_required
def get_research_list():
    """Return research sessions formatted for dropdown menus."""
    service = LibraryService(session["username"])
    return jsonify({"research": service.get_research_list_for_dropdown()})
@library_bp.route("/api/sync-library", methods=["POST"])
@login_required
def sync_library():
    """Reconcile the library database with files on disk; return stats."""
    service = LibraryService(session["username"])
    return jsonify(service.sync_library_with_filesystem())
@library_bp.route("/api/mark-redownload", methods=["POST"])
@login_required
@require_json_body()
def mark_for_redownload():
    """Flag the given documents for re-download."""
    service = LibraryService(session["username"])

    data = request.json
    document_ids = data.get("document_ids", [])
    if not document_ids:
        return jsonify({"error": "No document IDs provided"}), 400

    marked = service.mark_for_redownload(document_ids)
    return jsonify({"success": True, "marked": marked})
@library_bp.route("/api/queue-all-undownloaded", methods=["POST"])
@login_required
def queue_all_undownloaded():
    """Queue every research resource that has no completed download.

    Pipeline:
      1. Query resources with no associated completed Document.
      2. Run them through the smart retry filter (permanently /
         temporarily failed resources are skipped).
      3. Skip resources without a URL or from non-downloadable domains.
      4. Insert a pending queue entry per resource, or reset an
         existing entry back to pending.

    Returns JSON with queued/skipped counts, the affected research IDs,
    and a detailed filter summary. Downloads themselves are processed
    asynchronously (SSE endpoint / background tasks), not here.
    """
    username = session["username"]

    logger.info(f"queue_all_undownloaded called for user {username}")

    with get_user_db_session(username) as db_session:
        # Find all resources that don't have a completed download.
        # The outer join matches a Document either via resource_id or via
        # the resource's document_id, restricted to completed documents;
        # rows with no match (Document.id IS NULL) are undownloaded.
        undownloaded = (
            db_session.query(ResearchResource)
            .outerjoin(
                Document,
                (
                    (ResearchResource.id == Document.resource_id)
                    | (ResearchResource.document_id == Document.id)
                )
                & (Document.status == "completed"),
            )
            .filter(Document.id.is_(None))
            .all()
        )

        logger.info(f"Found {len(undownloaded)} total undownloaded resources")

        # Get user password for encrypted database access
        user_password = get_authenticated_user_password(username)

        resource_filter = ResourceFilter(username, user_password)
        filter_results = resource_filter.filter_downloadable_resources(
            undownloaded
        )

        # Detailed filtering summary, included in the response payload
        filter_summary = resource_filter.get_filter_summary(undownloaded)
        skipped_info = resource_filter.get_skipped_resources_info(undownloaded)

        logger.info(f"Filter results: {filter_summary.to_dict()}")

        queued_count = 0
        research_ids = set()
        skipped_count = 0

        # Convert filter_results to dict for O(1) lookup instead of O(n²)
        filter_results_by_id = {r.resource_id: r for r in filter_results}

        for resource in undownloaded:
            # Check if resource passed the smart filter
            filter_result = filter_results_by_id.get(resource.id)

            if not filter_result or not filter_result.can_retry:
                skipped_count += 1
                if filter_result:
                    logger.debug(
                        f"Skipping resource {resource.id} due to retry policy: {filter_result.reason}"
                    )
                else:
                    logger.debug(
                        f"Skipping resource {resource.id} - no filter result available"
                    )
                continue

            # A resource without a URL cannot be downloaded at all
            if not resource.url:
                skipped_count += 1
                continue

            # Check if it's downloadable using proper URL parsing
            is_downloadable = is_downloadable_domain(resource.url)

            # resource.url is guaranteed non-empty past the guard above,
            # so no extra truthiness checks are needed below.
            if "pubmed" in resource.url.lower():
                logger.info(f"Found PubMed URL: {resource.url[:100]}")

            if not is_downloadable:
                skipped_count += 1
                logger.debug(
                    f"Skipping non-downloadable URL: {resource.url[:100]}"
                )
                continue

            # Check if already in queue (any status)
            existing_queue = (
                db_session.query(LibraryDownloadQueue)
                .filter_by(resource_id=resource.id)
                .first()
            )

            if existing_queue:
                # If it exists but isn't pending, reset it to pending
                if existing_queue.status != DocumentStatus.PENDING:
                    existing_queue.status = DocumentStatus.PENDING
                    existing_queue.completed_at = None
                    queued_count += 1
                    research_ids.add(resource.research_id)
                    logger.debug(
                        f"Reset queue entry for resource {resource.id} to pending"
                    )
                else:
                    # Already pending, still count it
                    queued_count += 1
                    research_ids.add(resource.research_id)
                    logger.debug(
                        f"Resource {resource.id} already pending in queue"
                    )
            else:
                # Add new entry to queue
                queue_entry = LibraryDownloadQueue(
                    resource_id=resource.id,
                    research_id=resource.research_id,
                    priority=0,
                    status=DocumentStatus.PENDING,
                )
                db_session.add(queue_entry)
                queued_count += 1
                research_ids.add(resource.research_id)
                logger.debug(
                    f"Added new queue entry for resource {resource.id}"
                )

        db_session.commit()

        logger.info(
            f"Queued {queued_count} articles for download, skipped {skipped_count} resources (including {filter_summary.permanently_failed_count} permanently failed and {filter_summary.temporarily_failed_count} temporarily failed)"
        )

        # Note: Removed synchronous download processing here to avoid blocking the HTTP request
        # Downloads will be processed via the SSE streaming endpoint or background tasks
        return jsonify(
            {
                "success": True,
                "queued": queued_count,
                "research_ids": list(research_ids),
                "total_undownloaded": len(undownloaded),
                "skipped": skipped_count,
                "filter_summary": filter_summary.to_dict(),
                "skipped_details": skipped_info,
            }
        )
@library_bp.route("/api/get-research-sources/<research_id>", methods=["GET"])
@login_required
def get_research_sources(research_id):
    """Get all sources for a research with snippets.

    Lists every :class:`ResearchResource` for *research_id* in creation
    order, annotating each with its URL domain and, when a completed
    Document exists for it, download metadata (document id, file type,
    download date).

    Returns:
        JSON: ``{"success": True, "sources": [...], "total": N}``.
    """
    # Hoisted out of the per-resource loop (was previously re-executed
    # on every iteration inside the try block).
    from urllib.parse import urlparse

    username = session["username"]

    sources = []
    with get_user_db_session(username) as db_session:
        # Get all resources for this research
        resources = (
            db_session.query(ResearchResource)
            .filter_by(research_id=research_id)
            .order_by(ResearchResource.created_at)
            .all()
        )

        for idx, resource in enumerate(resources, 1):
            # Check if a downloaded document exists for this resource
            document = get_document_for_resource(db_session, resource)

            # Get domain from URL; malformed URLs yield an empty domain
            domain = ""
            if resource.url:
                try:
                    domain = urlparse(resource.url).hostname or ""
                except (ValueError, AttributeError):
                    # urlparse can raise ValueError for malformed URLs
                    pass

            source_data = {
                "number": idx,
                "resource_id": resource.id,
                "url": resource.url,
                "title": resource.title or f"Source {idx}",
                "snippet": resource.content_preview or "",
                "domain": domain,
                "relevance_score": getattr(resource, "relevance_score", None),
                "downloaded": False,
                "document_id": None,
                "file_type": None,
            }

            if document and document.status == "completed":
                source_data.update(
                    {
                        "downloaded": True,
                        "document_id": document.id,
                        "file_type": document.file_type,
                        "download_date": document.created_at.isoformat()
                        if document.created_at
                        else None,
                    }
                )

            sources.append(source_data)

    return jsonify({"success": True, "sources": sources, "total": len(sources)})
@library_bp.route("/api/check-downloads", methods=["POST"])
@login_required
@require_json_body()
def check_downloads():
    """Report the download state of a batch of URLs for one research.

    Expects JSON with ``research_id`` and a ``urls`` list; responds
    with a per-URL mapping describing whether a completed document
    exists (and its metadata) or not (and the resource id).
    """
    current_user = session["username"]
    payload = request.json
    research_id = payload.get("research_id")
    urls = payload.get("urls", [])

    if not research_id or not urls:
        return jsonify({"error": "Missing research_id or urls"}), 400

    statuses = {}

    with get_user_db_session(current_user) as db_session:
        # Restrict to this research's resources whose URL was requested
        matching_resources = (
            db_session.query(ResearchResource)
            .filter_by(research_id=research_id)
            .filter(ResearchResource.url.in_(urls))
            .all()
        )

        for res in matching_resources:
            # A resource counts as downloaded only with a completed document
            doc = get_document_for_resource(db_session, res)

            if doc and doc.status == "completed":
                statuses[res.url] = {
                    "downloaded": True,
                    "document_id": doc.id,
                    "file_path": doc.file_path,
                    "file_type": doc.file_type,
                    "title": doc.title or res.title,
                }
            else:
                statuses[res.url] = {
                    "downloaded": False,
                    "resource_id": res.id,
                }

    return jsonify({"download_status": statuses})
@library_bp.route("/api/download-source", methods=["POST"])
@login_required
@require_json_body()
def download_source():
    """Download a single source from a research.

    Expects JSON with ``research_id`` and ``url``. Validates that the
    URL is from a downloadable domain, upserts a high-priority queue
    entry for the matching resource, then attempts the download
    immediately via :class:`DownloadService`.

    Returns:
        JSON success/failure payload; HTTP 400 for bad input and
        HTTP 404 when the resource is unknown.
    """
    username = session["username"]
    user_password = get_authenticated_user_password(username)
    data = request.json
    research_id = data.get("research_id")
    url = data.get("url")

    if not research_id or not url:
        return jsonify({"error": "Missing research_id or url"}), 400

    # Check if URL is downloadable
    if not is_downloadable_domain(url):
        return jsonify({"error": "URL is not from a downloadable domain"}), 400

    with get_user_db_session(username) as db_session:
        # Find the resource
        resource = (
            db_session.query(ResearchResource)
            .filter_by(research_id=research_id, url=url)
            .first()
        )

        if not resource:
            return jsonify({"error": "Resource not found"}), 404

        # Check if already downloaded
        existing = get_document_for_resource(db_session, resource)

        if existing and existing.status == "completed":
            return jsonify(
                {
                    "success": True,
                    "message": "Already downloaded",
                    "document_id": existing.id,
                }
            )

        # Add to download queue (or reset an existing entry to pending)
        queue_entry = (
            db_session.query(LibraryDownloadQueue)
            .filter_by(resource_id=resource.id)
            .first()
        )

        if not queue_entry:
            queue_entry = LibraryDownloadQueue(
                resource_id=resource.id,
                research_id=resource.research_id,
                priority=1,  # Higher priority for manual downloads
                status=DocumentStatus.PENDING,
            )
            db_session.add(queue_entry)
        else:
            queue_entry.status = DocumentStatus.PENDING
            queue_entry.priority = 1

        db_session.commit()

        # Start download immediately
        with DownloadService(username, user_password) as service:
            success, message = service.download_resource(resource.id)

        if success:
            return jsonify(
                {"success": True, "message": "Download completed"}
            )
        # Log internal message, but show only generic message to user.
        # (Previously the internal message was silently discarded.)
        logger.warning(
            f"Download failed for resource {resource.id}: {message}"
        )
        return jsonify({"success": False, "message": "Download failed"})