Coverage for src / local_deep_research / research_library / routes / library_routes.py: 42%

555 statements  

« prev     ^ index     » next       coverage.py v7.13.4, created at 2026-02-25 01:07 +0000

1""" 

2Routes for Research Library and Download Manager 

3 

4Provides web endpoints for: 

5- Library browsing and management 

6- Download manager interface 

7- API endpoints for downloads and queries 

8""" 

9 

10import json 

11from io import BytesIO 

12from pathlib import Path 

13from urllib.parse import urlparse 

14from flask import ( 

15 Blueprint, 

16 g, 

17 jsonify, 

18 request, 

19 session, 

20 Response, 

21 send_file, 

22 stream_with_context, 

23) 

24from loguru import logger 

25 

26from ...web.auth.decorators import login_required 

27from ...web.utils.templates import render_template_with_defaults 

28from ...database.session_context import get_user_db_session 

29from ...database.models.research import ResearchResource 

30from ...database.models.library import ( 

31 Document as Document, 

32 DocumentStatus, 

33 DownloadQueue as LibraryDownloadQueue, 

34 Collection, 

35) 

36from ...library.download_management import ResourceFilter 

37from ..services.download_service import DownloadService 

38from ..services.library_service import LibraryService 

39from ..services.pdf_storage_manager import PDFStorageManager 

40from ..utils import handle_api_error 

41from ...utilities.db_utils import get_settings_manager 

42from ...config.paths import get_library_directory 

43 

44# Create Blueprint 

45library_bp = Blueprint("library", __name__, url_prefix="/library") 

46 

47 

48# Error handler for authentication errors 

# Error handler for authentication errors
@library_bp.errorhandler(Exception)
def handle_web_api_exception(error):
    """Handle WebAPIException and its subclasses.

    Structured API errors are serialized to JSON with their own status
    code; anything else propagates to Flask's default error handling.
    """
    from ...web.exceptions import WebAPIException

    if not isinstance(error, WebAPIException):
        # Re-raise other exceptions
        raise error
    return jsonify(error.to_dict()), error.status_code

58 

59 

def is_downloadable_domain(url: str) -> bool:
    """Check if URL is from a downloadable academic domain using proper URL parsing.

    A URL qualifies when it is a direct PDF link, is hosted on (or on a
    subdomain of) a known academic publisher, looks like a PubMed link,
    or carries a PDF hint in its path or query string.

    Args:
        url: The URL to classify. Falsy values return False.

    Returns:
        True when the URL looks downloadable; False otherwise, including
        when parsing fails (the failure is logged).
    """
    try:
        if not url:
            return False

        # Lowercase once and reuse: the original computed url.lower()
        # twice (once for parsing, once for the ".pdf?" check).
        lowered = url.lower()
        parsed = urlparse(lowered)
        hostname = parsed.hostname or ""
        path = parsed.path or ""
        query = parsed.query or ""

        # Check for direct PDF files
        if path.endswith(".pdf") or ".pdf?" in lowered:
            return True

        # Known downloadable academic domains. frozenset gives an O(1)
        # exact-hostname check and avoids rebuilding a list on each call.
        downloadable_domains = frozenset(
            {
                "arxiv.org",
                "biorxiv.org",
                "medrxiv.org",
                "ncbi.nlm.nih.gov",
                "pubmed.ncbi.nlm.nih.gov",
                "europepmc.org",
                "semanticscholar.org",
                "researchgate.net",
                "academia.edu",
                "sciencedirect.com",
                "springer.com",
                "nature.com",
                "wiley.com",
                "ieee.org",
                "acm.org",
                "plos.org",
                "frontiersin.org",
                "mdpi.com",
                "acs.org",
                "rsc.org",
                "tandfonline.com",
                "sagepub.com",
                "oxford.com",
                "cambridge.org",
                "bmj.com",
                "nejm.org",
                "thelancet.com",
                "jamanetwork.com",
                "annals.org",
                "ahajournals.org",
                "cell.com",
                "science.org",
                "pnas.org",
                "elifesciences.org",
                "embopress.org",
                "journals.asm.org",
                "microbiologyresearch.org",
                "jvi.asm.org",
                "genome.cshlp.org",
                "genetics.org",
                "g3journal.org",
                "plantphysiol.org",
                "plantcell.org",
                "aspb.org",
                "bioone.org",
                "company-of-biologists.org",
                "biologists.org",
                "jeb.biologists.org",
                "dmm.biologists.org",
                "bio.biologists.org",
                "doi.org",
                "ssrn.com",
                "openreview.net",
            }
        )

        # Exact hostname match is O(1); subdomain match needs a suffix scan.
        if hostname in downloadable_domains:
            return True
        if any(
            hostname.endswith("." + domain)
            for domain in downloadable_domains
        ):
            return True

        # Special case for PubMed which might appear in hostname or path
        if "pubmed" in hostname or "/pubmed/" in path:
            return True

        # Check for PDF in path or query parameters
        if "/pdf/" in path or "type=pdf" in query or "format=pdf" in query:
            return True

        return False

    except Exception as e:
        logger.warning(f"Error parsing URL {url}: {e}")
        return False

150 

151 

def get_authenticated_user_password(
    username: str, flask_session_id: str = None
) -> str:
    """
    Get authenticated user password from session store with fallback to g.user_password.

    Args:
        username: The username to get password for
        flask_session_id: Optional Flask session ID. If not provided, uses
            session.get("session_id")

    Returns:
        str: The user's password

    Raises:
        AuthenticationRequiredError: If no password is available for the user
    """
    from ...database.session_passwords import session_password_store
    from ...web.exceptions import AuthenticationRequiredError

    session_id = flask_session_id or session.get("session_id")

    # Primary source: the in-memory session password store.
    try:
        stored = session_password_store.get_session_password(
            username, session_id
        )
    except Exception:
        logger.exception("Failed to get user password from session store")
        stored = None
    if stored:
        logger.debug(
            f"Retrieved user password from session store for user {username}"
        )
        return stored

    # Secondary source: g.user_password (set by middleware if temp_auth was used)
    fallback = getattr(g, "user_password", None)
    if fallback:
        logger.debug(
            f"Retrieved user password from g.user_password fallback for user {username}"
        )
        return fallback

    # Neither source produced a password — the caller must re-authenticate.
    logger.error(f"No user password available for user {username}")
    raise AuthenticationRequiredError(
        message="Authentication required: Please refresh the page and log in again to access encrypted database features.",
        username=username,
    )

200 

201 

202# ============= Page Routes ============= 

203 

204 

@library_bp.route("/")
@login_required
def library_page():
    """Main library page showing downloaded documents.

    Renders the library template with aggregate stats, the (optionally
    filtered) document list, and the dropdown data for the domain /
    research / collection filters.
    """
    username = session.get("username")
    service = LibraryService(username)

    # Library settings. get_settings_manager is already imported at module
    # level; the previous function-local re-import was redundant.
    settings = get_settings_manager()
    pdf_storage_mode = settings.get_setting(
        "research_library.pdf_storage_mode", "database"
    )
    # Enable PDF storage button if mode is not "none"
    enable_pdf_storage = pdf_storage_mode != "none"
    shared_library = settings.get_setting(
        "research_library.shared_library", False
    )

    # Get statistics
    stats = service.get_library_stats()

    # Get documents with optional filters from the query string
    domain_filter = request.args.get("domain")
    research_filter = request.args.get("research")
    collection_filter = request.args.get("collection")  # New collection filter

    documents = service.get_documents(
        research_id=research_filter,
        domain=domain_filter,
        collection_id=collection_filter,
        limit=100,
    )

    # Dropdown data for the filter controls
    unique_domains = service.get_unique_domains()
    research_list = service.get_research_list_with_stats()
    collections = service.get_all_collections()

    return render_template_with_defaults(
        "pages/library.html",
        stats=stats,
        documents=documents,
        unique_domains=unique_domains,
        research_list=research_list,
        collections=collections,
        selected_collection=collection_filter,
        storage_path=stats.get("storage_path", ""),
        enable_pdf_storage=enable_pdf_storage,
        pdf_storage_mode=pdf_storage_mode,
        shared_library=shared_library,
    )

262 

263 

@library_bp.route("/document/<string:document_id>")
@login_required
def document_details_page(document_id):
    """Document details page showing all metadata and links."""
    service = LibraryService(session.get("username"))

    # Look the document up; a missing document yields a plain-text 404
    # like the other page routes in this module.
    document = service.get_document_by_id(document_id)
    if not document:
        return "Document not found", 404

    return render_template_with_defaults(
        "pages/document_details.html", document=document
    )

280 

281 

@library_bp.route("/download-manager")
@login_required
def download_manager_page():
    """Download manager page for selecting and downloading research PDFs.

    Builds per-research summary statistics plus a per-domain breakdown of
    PDF sources for each research session.
    """
    username = session.get("username")
    service = LibraryService(username)

    # Library settings. get_settings_manager is already imported at module
    # level; the previous function-local re-import was redundant.
    settings = get_settings_manager()
    pdf_storage_mode = settings.get_setting(
        "research_library.pdf_storage_mode", "database"
    )
    # Enable PDF storage button if mode is not "none"
    enable_pdf_storage = pdf_storage_mode != "none"
    shared_library = settings.get_setting(
        "research_library.shared_library", False
    )

    # Get research sessions with statistics
    research_list = service.get_research_list_with_stats()

    # Calculate summary statistics
    total_researches = len(research_list)
    total_resources = sum(r["total_resources"] for r in research_list)
    already_downloaded = sum(r["downloaded_count"] for r in research_list)
    available_to_download = (
        sum(r["downloadable_count"] for r in research_list) - already_downloaded
    )

    # Enrich research data with domain breakdowns
    for research in research_list:
        # Get PDF sources for this research
        documents = service.get_documents(
            research_id=research["id"], file_type="pdf"
        )
        research["pdf_sources"] = documents[:10]  # Preview first 10

        # Per-domain counters: total docs, PDFs, completed downloads
        domains = {}
        for doc in documents:
            domain = doc.get("domain", "unknown")
            if domain not in domains:
                domains[domain] = {"total": 0, "pdfs": 0, "downloaded": 0}
            domains[domain]["total"] += 1
            if doc["file_type"] == "pdf":
                domains[domain]["pdfs"] += 1
            if doc["download_status"] == "completed":
                domains[domain]["downloaded"] += 1

        research["domains"] = domains

    return render_template_with_defaults(
        "pages/download_manager.html",
        research_list=research_list,
        total_researches=total_researches,
        total_resources=total_resources,
        already_downloaded=already_downloaded,
        available_to_download=available_to_download,
        enable_pdf_storage=enable_pdf_storage,
        pdf_storage_mode=pdf_storage_mode,
        shared_library=shared_library,
    )

346 

347 

348# ============= API Routes ============= 

349 

350 

@library_bp.route("/api/stats")
@login_required
def get_library_stats():
    """Get library statistics."""
    # Thin JSON wrapper over LibraryService.get_library_stats().
    stats = LibraryService(session.get("username")).get_library_stats()
    return jsonify(stats)

359 

360 

@library_bp.route("/api/collections/list")
@login_required
def get_collections_list():
    """Get list of all collections for dropdown selection."""
    username = session.get("username")

    with get_user_db_session(username) as db_session:
        # Alphabetical order so the dropdown is stable and scannable.
        rows = db_session.query(Collection).order_by(Collection.name).all()

        payload = [
            {
                "id": row.id,
                "name": row.name,
                "description": row.description,
            }
            for row in rows
        ]

        return jsonify({"success": True, "collections": payload})

385 

386 

@library_bp.route("/api/documents")
@login_required
def get_documents():
    """Get documents with filtering.

    Query parameters:
        research_id, domain, file_type, search: optional filters.
        favorites: "true" restricts to favorited documents.
        limit, offset: pagination integers (defaults 100 / 0).

    Returns:
        JSON {"documents": [...]}, or HTTP 400 when limit/offset are not
        valid integers.
    """
    username = session.get("username")
    service = LibraryService(username)

    # Get filter parameters
    research_id = request.args.get("research_id")
    domain = request.args.get("domain")
    file_type = request.args.get("file_type")
    favorites_only = request.args.get("favorites") == "true"
    search_query = request.args.get("search")

    # Validate pagination explicitly: previously a non-numeric limit/offset
    # raised an unhandled ValueError and surfaced as an HTTP 500.
    try:
        limit = int(request.args.get("limit", 100))
        offset = int(request.args.get("offset", 0))
    except (TypeError, ValueError):
        return jsonify({"error": "limit and offset must be integers"}), 400

    documents = service.get_documents(
        research_id=research_id,
        domain=domain,
        file_type=file_type,
        favorites_only=favorites_only,
        search_query=search_query,
        limit=limit,
        offset=offset,
    )

    return jsonify({"documents": documents})

414 

415 

@library_bp.route(
    "/api/document/<string:document_id>/favorite", methods=["POST"]
)
@login_required
def toggle_favorite(document_id):
    """Toggle favorite status of a document."""
    service = LibraryService(session.get("username"))
    # The service returns the new favorite state after toggling.
    new_state = service.toggle_favorite(document_id)
    return jsonify({"favorite": new_state})

426 

427 

@library_bp.route("/api/document/<string:document_id>", methods=["DELETE"])
@login_required
def delete_document(document_id):
    """Delete a document from library."""
    service = LibraryService(session.get("username"))
    deleted = service.delete_document(document_id)
    return jsonify({"success": deleted})

436 

437 

@library_bp.route("/api/document/<string:document_id>/pdf-url")
@login_required
def get_pdf_url(document_id):
    """Get URL for viewing PDF."""
    # Build the URL of the PDF-serving endpoint; no DB lookup is done
    # here, so the title is a static placeholder.
    pdf_url = f"/library/api/document/{document_id}/pdf"
    return jsonify(
        {
            "url": pdf_url,
            "title": "Document",  # Could fetch actual title
        }
    )

449 

450 

@library_bp.route("/document/<string:document_id>/pdf")
@login_required
def view_pdf_page(document_id):
    """Page for viewing PDF file - uses PDFStorageManager for retrieval.

    Looks the document up in the per-user database, then delegates to
    PDFStorageManager which handles both database- and filesystem-backed
    PDF storage. Returns the PDF bytes inline (not as an attachment), or
    a plain-text 404 when the document or its PDF is missing.
    """
    username = session.get("username")

    with get_user_db_session(username) as db_session:
        # Get document from database
        document = db_session.query(Document).filter_by(id=document_id).first()

        if not document:
            logger.warning(
                f"Document ID {document_id} not found in database for user {username}"
            )
            return "Document not found", 404

        logger.info(
            f"Document {document_id}: title='{document.title}', "
            f"file_path={document.file_path}"
        )

        # Storage settings decide where the PDF bytes live; note the
        # default mode here is "none" (stricter than library_page's
        # "database" default).
        settings = get_settings_manager(db_session)
        storage_mode = settings.get_setting(
            "research_library.pdf_storage_mode", "none"
        )
        library_root = Path(
            settings.get_setting(
                "research_library.storage_path",
                str(get_library_directory()),
            )
        ).expanduser()

        # Use PDFStorageManager to load PDF (handles database and filesystem)
        pdf_manager = PDFStorageManager(library_root, storage_mode)
        pdf_bytes = pdf_manager.load_pdf(document, db_session)

        if pdf_bytes:
            logger.info(
                f"Serving PDF for document {document_id} ({len(pdf_bytes)} bytes)"
            )
            # Serve from memory; as_attachment=False renders inline in the
            # browser's PDF viewer.
            return send_file(
                BytesIO(pdf_bytes),
                mimetype="application/pdf",
                as_attachment=False,
                download_name=document.filename or "document.pdf",
            )

        # No PDF found anywhere
        logger.warning(f"No PDF available for document {document_id}")
        return "PDF not available", 404

502 

503 

@library_bp.route("/api/document/<string:document_id>/pdf")
@login_required
def serve_pdf_api(document_id):
    """API endpoint for serving PDF file (kept for backward compatibility)."""
    # Delegates entirely to the page route handler.
    response = view_pdf_page(document_id)
    return response

509 

510 

@library_bp.route("/document/<string:document_id>/txt")
@login_required
def view_text_page(document_id):
    """Page for viewing text content."""
    username = session.get("username")

    with get_user_db_session(username) as db:
        # Extracted text is stored directly on the Document row.
        document = db.query(Document).filter_by(id=document_id).first()

        if not document:
            logger.warning(f"Document not found for document ID {document_id}")
            return "Document not found", 404

        if not document.text_content:
            logger.warning(f"Document {document_id} has no text content")
            return "Text content not available", 404

        logger.info(
            f"Serving text content for document {document_id}: {len(document.text_content)} characters"
        )

        # Render the text inside an HTML wrapper page.
        return render_template_with_defaults(
            "pages/document_text.html",
            document_id=document_id,
            title=document.title or "Document Text",
            text_content=document.text_content,
            extraction_method=document.extraction_method,
            word_count=document.word_count,
        )

542 

543 

@library_bp.route("/api/document/<string:document_id>/text")
@login_required
def serve_text_api(document_id):
    """API endpoint for serving text content (kept for backward compatibility)."""
    username = session.get("username")

    with get_user_db_session(username) as db:
        # Extracted text is stored directly on the Document row.
        document = db.query(Document).filter_by(id=document_id).first()

        if not document:
            logger.warning(f"Document not found for document ID {document_id}")
            return jsonify({"error": "Document not found"}), 404

        if not document.text_content:
            logger.warning(f"Document {document_id} has no text content")
            return jsonify({"error": "Text content not available"}), 404

        logger.info(
            f"Serving text content for document {document_id}: {len(document.text_content)} characters"
        )

        payload = {
            "text_content": document.text_content,
            "title": document.title or "Document",
            "extraction_method": document.extraction_method,
            "word_count": document.word_count,
        }
        return jsonify(payload)

574 

575 

@library_bp.route("/api/open-folder", methods=["POST"])
@login_required
def open_folder():
    """Open folder containing a document.

    Security: This endpoint is disabled for server deployments.
    It only makes sense for desktop usage where the server and client are on the same machine.
    """
    # Always refuse: returning 403 keeps the endpoint discoverable for the
    # desktop build without exposing filesystem access on servers.
    payload = {
        "status": "error",
        "message": "This feature is disabled. It is only available in desktop mode.",
    }
    return jsonify(payload), 403

590 

591 

@library_bp.route("/api/download/<int:resource_id>", methods=["POST"])
@login_required
def download_single_resource(resource_id):
    """Download a single resource."""
    username = session.get("username")
    user_password = get_authenticated_user_password(username)

    with DownloadService(username, user_password) as service:
        success, error = service.download_resource(resource_id)

        if success:
            return jsonify({"success": True})

        # Log the internal detail but return a generic client-facing message.
        logger.warning(f"Download failed for resource {resource_id}: {error}")
        return jsonify(
            {
                "success": False,
                "error": "Download failed. Please try again or contact support.",
            }
        ), 500

613 

614 

@library_bp.route("/api/download-text/<int:resource_id>", methods=["POST"])
@login_required
def download_text_single(resource_id):
    """Download a single resource as text file."""
    try:
        username = session.get("username")
        user_password = get_authenticated_user_password(username)

        with DownloadService(username, user_password) as service:
            success, error = service.download_as_text(resource_id)

        if success:
            return jsonify({"success": True, "error": None})

        # Sanitize error message - don't expose internal details
        if error:
            logger.warning(
                f"Download as text failed for resource {resource_id}: {error}"
            )
        # NOTE(review): failures here respond with HTTP 200, unlike the PDF
        # endpoint above (500) — confirm the frontend depends on this before
        # changing it.
        return jsonify(
            {"success": False, "error": "Failed to download resource"}
        )
    except Exception as e:
        return handle_api_error(
            f"downloading resource {resource_id} as text", e
        )

641 

642 

@library_bp.route("/api/download-all-text", methods=["POST"])
@login_required
def download_all_text():
    """Download all undownloaded resources as text files.

    Streams progress to the client as Server-Sent Events: one JSON
    ``data:`` event per processed resource, then a final
    ``{"complete": true}`` event.
    """
    username = session.get("username")
    # Capture Flask session ID to avoid scoping issues in nested function
    flask_session_id = session.get("session_id")

    def generate():
        # Get user password for database operations
        from ...web.exceptions import AuthenticationRequiredError

        try:
            user_password = get_authenticated_user_password(
                username, flask_session_id
            )
        except AuthenticationRequiredError:
            # Without credentials the encrypted DB cannot be opened; end
            # the SSE stream silently (the client sees it close).
            logger.warning(f"Authentication expired for user {username}")
            return

        download_service = DownloadService(username, user_password)
        try:
            # Get all undownloaded resources
            # NOTE(review): `session` here shadows the module-level Flask
            # `session` import for the rest of this generator; the Flask
            # session is not needed past this point, but the shadowing is
            # fragile.
            with get_user_db_session(username) as session:
                # Get resources that don't have text files yet
                resources = session.query(ResearchResource).all()

                # Filter resources that need text extraction
                txt_path = Path(download_service.library_root) / "txt"
                resources_to_process = []

                # Pre-scan directory once to get all existing resource IDs
                # (avoids a per-resource filesystem glob).
                existing_resource_ids = set()
                if txt_path.exists():
                    for txt_file in txt_path.glob("*.txt"):
                        # Extract resource ID from filename pattern *_{id}.txt
                        parts = txt_file.stem.rsplit("_", 1)
                        if len(parts) == 2:
                            try:
                                existing_resource_ids.add(int(parts[1]))
                            except ValueError:
                                pass

                for resource in resources:
                    # Check if text file already exists using preloaded set
                    if resource.id not in existing_resource_ids:
                        resources_to_process.append(resource)

                total = len(resources_to_process)
                current = 0

                logger.info(f"Found {total} resources needing text extraction")

                for resource in resources_to_process:
                    current += 1
                    progress = (
                        int((current / total) * 100) if total > 0 else 100
                    )

                    # Truncated title used only for UI display.
                    file_name = (
                        resource.title[:50]
                        if resource
                        else f"document_{current}.txt"
                    )

                    try:
                        success, error = download_service.download_as_text(
                            resource.id
                        )

                        if success:
                            status = "success"
                            error_msg = None
                        else:
                            status = "failed"
                            error_msg = error or "Text extraction failed"

                    except Exception as e:
                        # One failing resource must not abort the stream;
                        # report the failure and continue.
                        logger.exception(
                            f"Error extracting text for resource {resource.id}"
                        )
                        status = "failed"
                        error_msg = (
                            f"Text extraction failed - {type(e).__name__}"
                        )

                    # Send update
                    update = {
                        "progress": progress,
                        "current": current,
                        "total": total,
                        "file": file_name,
                        "url": resource.url,  # Add the URL for UI display
                        "status": status,
                        "error": error_msg,
                    }
                    yield f"data: {json.dumps(update)}\n\n"

            # Send completion
            yield f"data: {json.dumps({'complete': True, 'total': total})}\n\n"
        finally:
            download_service.close()

    return Response(
        stream_with_context(generate()), mimetype="text/event-stream"
    )

749 

750 

@library_bp.route("/api/download-research/<research_id>", methods=["POST"])
@login_required
def download_research_pdfs(research_id):
    """Queue all PDFs from a research session for download."""
    username = session.get("username")
    user_password = get_authenticated_user_password(username)

    # Optional target collection supplied in the JSON body.
    payload = request.json or {}
    collection_id = payload.get("collection_id")

    with DownloadService(username, user_password) as service:
        queued = service.queue_research_downloads(research_id, collection_id)

        # Queue processing happens elsewhere; this endpoint only enqueues.
        # TODO: Integrate with existing queue processor
        return jsonify({"success": True, "queued": queued})

770 

771 

@library_bp.route("/api/download-bulk", methods=["POST"])
@login_required
def download_bulk():
    """Download PDFs or extract text from multiple research sessions.

    Request JSON:
        research_ids: list of research session IDs (required).
        mode: "pdf" (default) or "text_only".
        collection_id: optional target collection for downloads.

    Streams progress as Server-Sent Events; each event carries progress
    counters, the current file name, a status of success/skipped/failed,
    and an optional human-readable error/skip reason.
    """
    username = session.get("username")
    data = request.json
    research_ids = data.get("research_ids", [])
    mode = data.get("mode", "pdf")  # pdf or text_only
    collection_id = data.get(
        "collection_id"
    )  # Optional: target collection for downloads

    if not research_ids:
        return jsonify({"error": "No research IDs provided"}), 400

    # Capture Flask session ID to avoid scoping issues in nested function
    flask_session_id = session.get("session_id")

    def generate():
        """Generate progress updates as Server-Sent Events."""
        # Get user password for database operations
        from ...web.exceptions import AuthenticationRequiredError

        try:
            user_password = get_authenticated_user_password(
                username, flask_session_id
            )
        except AuthenticationRequiredError:
            # No credentials: end the stream without emitting events.
            return

        download_service = DownloadService(username, user_password)
        try:
            # Count total pending queue items across all research IDs
            total = 0
            current = 0

            # NOTE: `session` below shadows the Flask `session` import for
            # the remainder of this generator.
            with get_user_db_session(username) as session:
                for research_id in research_ids:
                    count = (
                        session.query(LibraryDownloadQueue)
                        .filter_by(
                            research_id=research_id,
                            status=DocumentStatus.PENDING,
                        )
                        .count()
                    )
                    total += count
                    logger.debug(
                        f"[PROGRESS_DEBUG] Research {research_id}: {count} pending items in queue"
                    )

            logger.info(
                f"[PROGRESS_DEBUG] Total pending downloads across all research: {total}"
            )
            yield f"data: {json.dumps({'progress': 0, 'current': 0, 'total': total})}\n\n"

            # Process each research
            for research_id in research_ids:
                # Get queued downloads for this research
                with get_user_db_session(username) as session:
                    # Get pending queue items for this research
                    queue_items = (
                        session.query(LibraryDownloadQueue)
                        .filter_by(
                            research_id=research_id,
                            status=DocumentStatus.PENDING,
                        )
                        .all()
                    )

                    # If no items queued yet, queue them now
                    if not queue_items:
                        try:
                            download_service.queue_research_downloads(
                                research_id, collection_id
                            )
                            # Re-fetch queue items. Fixed: this query used a
                            # raw status="pending" string while every other
                            # queue query uses the DocumentStatus enum —
                            # use the enum consistently.
                            queue_items = (
                                session.query(LibraryDownloadQueue)
                                .filter_by(
                                    research_id=research_id,
                                    status=DocumentStatus.PENDING,
                                )
                                .all()
                            )
                        except Exception:
                            logger.exception(
                                f"Error queueing downloads for research {research_id}"
                            )
                            # Continue with empty queue_items
                            queue_items = []

                    # Process each queued item
                    for queue_item in queue_items:
                        logger.debug(
                            f"[PROGRESS_DEBUG] Before increment: current={current} (type: {type(current)}), total={total} (type: {type(total)})"
                        )
                        current += 1
                        logger.debug(
                            f"[PROGRESS_DEBUG] After increment: current={current} (type: {type(current)})"
                        )

                        # Check for division issues
                        if total is None:
                            logger.error(
                                "[PROGRESS_DEBUG] ERROR: total is None! Setting to 0 to avoid crash"
                            )
                            total = 0

                        progress = (
                            int((current / total) * 100) if total > 0 else 100
                        )
                        logger.debug(
                            f"[PROGRESS_DEBUG] Calculated progress: {progress}%"
                        )

                        # Get resource info (title for UI display only)
                        resource = session.query(ResearchResource).get(
                            queue_item.resource_id
                        )
                        file_name = (
                            resource.title[:50]
                            if resource
                            else f"document_{current}.pdf"
                        )

                        # Attempt actual download with error handling
                        skip_reason = None
                        status = "skipped"  # Default to skipped
                        success = False
                        error_msg = None

                        try:
                            logger.debug(
                                f"Attempting {'PDF download' if mode == 'pdf' else 'text extraction'} for resource {queue_item.resource_id}"
                            )

                            # Call appropriate service method based on mode
                            if mode == "pdf":
                                result = download_service.download_resource(
                                    queue_item.resource_id
                                )
                            else:  # text_only
                                result = download_service.download_as_text(
                                    queue_item.resource_id
                                )

                            # Handle new tuple return format
                            if isinstance(result, tuple):
                                success, skip_reason = result
                            else:
                                success = result
                                skip_reason = None

                            status = "success" if success else "skipped"
                            if skip_reason and not success:
                                error_msg = skip_reason
                                logger.info(
                                    f"{'Download' if mode == 'pdf' else 'Text extraction'} skipped for resource {queue_item.resource_id}: {skip_reason}"
                                )

                            logger.debug(
                                f"{'Download' if mode == 'pdf' else 'Text extraction'} result: success={success}, status={status}, skip_reason={skip_reason}"
                            )
                        except Exception as e:
                            # Log error but continue processing
                            error_msg = str(e)
                            error_type = type(e).__name__
                            logger.info(
                                f"CAUGHT Download exception for resource {queue_item.resource_id}: {error_type}: {error_msg}"
                            )
                            # Check if this is a skip reason (not a real error)
                            # Use error category + categorized message for user display
                            if any(
                                phrase in error_msg.lower()
                                for phrase in [
                                    "paywall",
                                    "subscription",
                                    "not available",
                                    "not found",
                                    "no free",
                                    "embargoed",
                                    "forbidden",
                                    "not accessible",
                                ]
                            ):
                                status = "skipped"
                                skip_reason = f"Document not accessible (paywall or access restriction) - {error_type}"
                            elif any(
                                phrase in error_msg.lower()
                                for phrase in [
                                    "failed to download",
                                    "could not",
                                    "invalid",
                                    "server",
                                ]
                            ):
                                status = "failed"
                                skip_reason = f"Download failed - {error_type}"
                            else:
                                status = "failed"
                                skip_reason = (
                                    f"Processing failed - {error_type}"
                                )
                            success = False

                        # Ensure skip_reason is set if we have an error message
                        if error_msg and not skip_reason:
                            skip_reason = f"Processing failed - {error_type}"
                            logger.debug(
                                f"Setting skip_reason from error_msg: {error_msg}"
                            )

                        # Send progress update
                        update_data = {
                            "progress": progress,
                            "current": current,
                            "total": total,
                            "file": file_name,
                            "status": status,
                        }
                        # Add skip reason if available
                        if skip_reason:
                            update_data["error"] = skip_reason
                            logger.info(
                                f"Sending skip reason to UI: {skip_reason}"
                            )

                        logger.info(f"Update data being sent: {update_data}")
                        yield f"data: {json.dumps(update_data)}\n\n"

            yield f"data: {json.dumps({'progress': 100, 'current': total, 'total': total, 'complete': True})}\n\n"
        finally:
            download_service.close()

    return Response(
        stream_with_context(generate()),
        mimetype="text/event-stream",
        headers={"Cache-Control": "no-cache", "X-Accel-Buffering": "no"},
    )

1011 

1012 

@library_bp.route("/api/research-list")
@login_required
def get_research_list():
    """Return the current user's research sessions with download statistics.

    Response JSON: {"research": [...]}, where each entry is produced by
    LibraryService.get_research_list_with_stats().
    """
    library = LibraryService(session.get("username"))
    return jsonify({"research": library.get_research_list_with_stats()})

1021 

1022 

@library_bp.route("/api/sync-library", methods=["POST"])
@login_required
def sync_library():
    """Reconcile the library database with the files on disk.

    Delegates to LibraryService.sync_library_with_filesystem() and returns
    its statistics dict as JSON.
    """
    library = LibraryService(session.get("username"))
    return jsonify(library.sync_library_with_filesystem())

1031 

1032 

@library_bp.route("/api/mark-redownload", methods=["POST"])
@login_required
def mark_for_redownload():
    """Mark a batch of documents for re-download.

    Expects a JSON body: {"document_ids": [<id>, ...]}.

    Returns:
        200 with {"success": True, "marked": <count>} on success,
        400 when the body is missing or contains no document IDs.
    """
    username = session.get("username")
    service = LibraryService(username)

    # request.json is None when the body is absent or the JSON literal
    # `null`; fall back to an empty dict so .get() below cannot raise
    # AttributeError.
    data = request.json or {}
    document_ids = data.get("document_ids", [])

    if not document_ids:
        return jsonify({"error": "No document IDs provided"}), 400

    count = service.mark_for_redownload(document_ids)
    return jsonify({"success": True, "marked": count})

1048 

1049 

@library_bp.route("/api/queue-all-undownloaded", methods=["POST"])
@login_required
def queue_all_undownloaded():
    """Queue every research resource that has no completed download.

    Candidates are narrowed by the smart retry filter (ResourceFilter) and
    a downloadable-domain check.  Existing queue rows are re-armed to
    PENDING rather than duplicated.  Actual downloading happens elsewhere
    (SSE stream / background tasks); this endpoint only queues.

    Returns:
        JSON with queued/skipped counts, affected research IDs, and the
        filter summary details.
    """
    username = session.get("username")

    logger.info(f"queue_all_undownloaded called for user {username}")

    with get_user_db_session(username) as db_session:
        # "Undownloaded" == no completed Document row points at the
        # resource (anti-join: LEFT OUTER JOIN + IS NULL).
        undownloaded = (
            db_session.query(ResearchResource)
            .outerjoin(
                Document,
                (ResearchResource.id == Document.resource_id)
                & (Document.status == "completed"),
            )
            .filter(Document.id.is_(None))
            .all()
        )

        logger.info(f"Found {len(undownloaded)} total undownloaded resources")

        # The per-user database is encrypted; the filter needs the password.
        user_password = get_authenticated_user_password(username)

        resource_filter = ResourceFilter(username, user_password)
        filter_results = resource_filter.filter_downloadable_resources(
            undownloaded
        )
        filter_summary = resource_filter.get_filter_summary(undownloaded)
        skipped_info = resource_filter.get_skipped_resources_info(undownloaded)

        logger.info(f"Filter results: {filter_summary.to_dict()}")

        # Index results once so the loop below is O(1) per resource.
        verdict_by_id = {r.resource_id: r for r in filter_results}

        queued_count = 0
        skipped_count = 0
        research_ids = set()

        for resource in undownloaded:
            verdict = verdict_by_id.get(resource.id)

            # Respect the retry policy; also skip anything the filter
            # never produced a verdict for.
            if not verdict or not verdict.can_retry:
                skipped_count += 1
                if verdict:
                    logger.debug(
                        f"Skipping resource {resource.id} due to retry policy: {verdict.reason}"
                    )
                else:
                    logger.debug(
                        f"Skipping resource {resource.id} - no filter result available"
                    )
                continue

            # Nothing to fetch without a URL.
            if not resource.url:
                skipped_count += 1
                continue

            # Visibility aid while debugging PubMed handling.
            if resource.url and "pubmed" in resource.url.lower():
                logger.info(f"Found PubMed URL: {resource.url[:100]}")

            # Proper URL/domain check rather than substring matching.
            if not is_downloadable_domain(resource.url):
                skipped_count += 1
                logger.debug(
                    f"Skipping non-downloadable URL: {resource.url[:100] if resource.url else 'None'}"
                )
                continue

            # One queue row per resource regardless of status.
            existing_queue = (
                db_session.query(LibraryDownloadQueue)
                .filter_by(resource_id=resource.id)
                .first()
            )

            if existing_queue is None:
                # First encounter: create a fresh pending entry.
                db_session.add(
                    LibraryDownloadQueue(
                        resource_id=resource.id,
                        research_id=resource.research_id,
                        priority=0,
                        status=DocumentStatus.PENDING,
                    )
                )
                logger.debug(
                    f"Added new queue entry for resource {resource.id}"
                )
            elif existing_queue.status != DocumentStatus.PENDING:
                # Re-arm a finished/failed entry.
                existing_queue.status = DocumentStatus.PENDING
                existing_queue.completed_at = None
                logger.debug(
                    f"Reset queue entry for resource {resource.id} to pending"
                )
            else:
                # Already waiting; still counts toward the total.
                logger.debug(
                    f"Resource {resource.id} already pending in queue"
                )

            # All three branches above count as "queued".
            queued_count += 1
            research_ids.add(resource.research_id)

        db_session.commit()

        logger.info(
            f"Queued {queued_count} articles for download, skipped {skipped_count} resources (including {filter_summary.permanently_failed_count} permanently failed and {filter_summary.temporarily_failed_count} temporarily failed)"
        )

        # Note: Removed synchronous download processing here to avoid blocking the HTTP request
        # Downloads will be processed via the SSE streaming endpoint or background tasks

        return jsonify(
            {
                "success": True,
                "queued": queued_count,
                "research_ids": list(research_ids),
                "total_undownloaded": len(undownloaded),
                "skipped": skipped_count,
                "filter_summary": filter_summary.to_dict(),
                "skipped_details": skipped_info,
            }
        )

1187 

1188 

@library_bp.route("/api/get-research-sources/<research_id>", methods=["GET"])
@login_required
def get_research_sources(research_id):
    """Get all sources for a research with snippets.

    Args:
        research_id: ID of the research whose resources are listed.

    Returns:
        JSON {"success": True, "sources": [...], "total": <int>}.  Each
        source includes URL, title, snippet, display domain and — when a
        completed download exists — the document id, file type and date.
    """
    username = session.get("username")

    sources = []
    with get_user_db_session(username) as db_session:
        # All resources for this research, in creation order.
        resources = (
            db_session.query(ResearchResource)
            .filter_by(research_id=research_id)
            .order_by(ResearchResource.created_at)
            .all()
        )

        for idx, resource in enumerate(resources, 1):
            # Check whether a document exists for this resource.
            document = (
                db_session.query(Document)
                .filter_by(resource_id=resource.id)
                .first()
            )

            # Derive the display domain from the URL.  urlparse is already
            # imported at module level — the previous per-iteration
            # `from urllib.parse import urlparse` was redundant.
            domain = ""
            if resource.url:
                try:
                    domain = urlparse(resource.url).hostname or ""
                except (ValueError, AttributeError):
                    # urlparse can raise ValueError for malformed URLs
                    pass

            source_data = {
                "number": idx,
                "resource_id": resource.id,
                "url": resource.url,
                "title": resource.title or f"Source {idx}",
                "snippet": resource.content_preview or "",
                "domain": domain,
                "relevance_score": getattr(resource, "relevance_score", None),
                "downloaded": False,
                "document_id": None,
                "file_type": None,
            }

            if document and document.status == "completed":
                source_data.update(
                    {
                        "downloaded": True,
                        "document_id": document.id,
                        "file_type": document.file_type,
                        "download_date": document.created_at.isoformat()
                        if document.created_at
                        else None,
                    }
                )

            sources.append(source_data)

    return jsonify({"success": True, "sources": sources, "total": len(sources)})

1252 

1253 

@library_bp.route("/api/check-downloads", methods=["POST"])
@login_required
def check_downloads():
    """Check download status for a list of URLs.

    Expects a JSON body: {"research_id": <id>, "urls": [<url>, ...]}.

    Returns:
        JSON {"download_status": {url: {...}}} for every matching
        resource; 400 when research_id or urls is missing.
    """
    username = session.get("username")
    # Guard against an absent/null JSON body so .get() below cannot raise
    # AttributeError.
    data = request.json or {}
    research_id = data.get("research_id")
    urls = data.get("urls", [])

    if not research_id or not urls:
        return jsonify({"error": "Missing research_id or urls"}), 400

    download_status = {}

    with get_user_db_session(username) as db_session:
        # Restrict to this research and to the URLs the caller asked about.
        resources = (
            db_session.query(ResearchResource)
            .filter_by(research_id=research_id)
            .filter(ResearchResource.url.in_(urls))
            .all()
        )

        for resource in resources:
            # Check whether a document exists for this resource.
            document = (
                db_session.query(Document)
                .filter_by(resource_id=resource.id)
                .first()
            )

            if document and document.status == "completed":
                download_status[resource.url] = {
                    "downloaded": True,
                    "document_id": document.id,
                    "file_path": document.file_path,
                    "file_type": document.file_type,
                    "title": document.title or resource.title,
                }
            else:
                download_status[resource.url] = {
                    "downloaded": False,
                    "resource_id": resource.id,
                }

    return jsonify({"download_status": download_status})

1300 

1301 

@library_bp.route("/api/download-source", methods=["POST"])
@login_required
def download_source():
    """Download a single source from a research.

    Expects a JSON body: {"research_id": <id>, "url": <url>}.

    Queues the resource at elevated priority and runs the download
    synchronously before responding.

    Returns:
        JSON success/failure payload; 400 for a bad request or
        non-downloadable domain, 404 when the resource is unknown.
    """
    username = session.get("username")
    user_password = get_authenticated_user_password(username)
    # Guard against an absent/null JSON body so .get() below cannot raise
    # AttributeError.
    data = request.json or {}
    research_id = data.get("research_id")
    url = data.get("url")

    if not research_id or not url:
        return jsonify({"error": "Missing research_id or url"}), 400

    # Check if URL is downloadable
    if not is_downloadable_domain(url):
        return jsonify({"error": "URL is not from a downloadable domain"}), 400

    with get_user_db_session(username) as db_session:
        # Find the resource
        resource = (
            db_session.query(ResearchResource)
            .filter_by(research_id=research_id, url=url)
            .first()
        )

        if not resource:
            return jsonify({"error": "Resource not found"}), 404

        # Check if already downloaded.  NOTE: Document exposes its state as
        # `.status` everywhere else in this module (queries, joins); the
        # previous `.download_status` lookup was inconsistent with that.
        existing = (
            db_session.query(Document)
            .filter_by(resource_id=resource.id)
            .first()
        )

        if existing and existing.status == "completed":
            return jsonify(
                {
                    "success": True,
                    "message": "Already downloaded",
                    "document_id": existing.id,
                }
            )

        # Add to (or re-arm in) the download queue.
        queue_entry = (
            db_session.query(LibraryDownloadQueue)
            .filter_by(resource_id=resource.id)
            .first()
        )

        if not queue_entry:
            queue_entry = LibraryDownloadQueue(
                resource_id=resource.id,
                research_id=resource.research_id,
                priority=1,  # Higher priority for manual downloads
                status=DocumentStatus.PENDING,
            )
            db_session.add(queue_entry)
        else:
            queue_entry.status = DocumentStatus.PENDING
            queue_entry.priority = 1

        db_session.commit()

        # Start download immediately
        with DownloadService(username, user_password) as service:
            success, message = service.download_resource(resource.id)

            if success:
                return jsonify(
                    {"success": True, "message": "Download completed"}
                )
            else:
                # Log internal message, but show only generic message to user
                logger.warning(
                    f"Download failed for resource {resource.id}: {message}"
                )
                return jsonify({"success": False, "message": "Download failed"})