Coverage for src / local_deep_research / research_library / routes / library_routes.py: 42%

555 statements  

« prev     ^ index     » next       coverage.py v7.13.4, created at 2026-02-25 01:07 +0000

1""" 

2Routes for Research Library and Download Manager 

3 

4Provides web endpoints for: 

5- Library browsing and management 

6- Download manager interface 

7- API endpoints for downloads and queries 

8""" 

9 

10import json 

11from io import BytesIO 

12from pathlib import Path 

13from urllib.parse import urlparse 

14from flask import ( 

15 Blueprint, 

16 g, 

17 jsonify, 

18 request, 

19 session, 

20 Response, 

21 send_file, 

22 stream_with_context, 

23) 

24from loguru import logger 

25 

26from ...web.auth.decorators import login_required 

27from ...web.utils.templates import render_template_with_defaults 

28from ...database.session_context import get_user_db_session 

29from ...database.models.research import ResearchResource 

30from ...database.models.library import ( 

31 Document as Document, 

32 DocumentStatus, 

33 DownloadQueue as LibraryDownloadQueue, 

34 Collection, 

35) 

36from ...library.download_management import ResourceFilter 

37from ..services.download_service import DownloadService 

38from ..services.library_service import LibraryService 

39from ..services.pdf_storage_manager import PDFStorageManager 

40from ..utils import handle_api_error 

41from ...utilities.db_utils import get_settings_manager 

42from ...config.paths import get_library_directory 

43 

44# Create Blueprint 

45library_bp = Blueprint("library", __name__, url_prefix="/library") 

46 

47 

48# Error handler for authentication errors 

# Error handler for authentication errors
@library_bp.errorhandler(Exception)
def handle_web_api_exception(error):
    """Handle WebAPIException and its subclasses.

    Structured API errors are serialized to JSON with their own status
    code; anything else propagates to Flask's default error handling.
    """
    from ...web.exceptions import WebAPIException

    if not isinstance(error, WebAPIException):
        # Re-raise other exceptions
        raise error
    return jsonify(error.to_dict()), error.status_code

58 

59 

def is_downloadable_domain(url: str) -> bool:
    """Check if URL is from a downloadable academic domain using proper URL parsing.

    A URL qualifies when it is a direct PDF link, is hosted on (or on a
    subdomain of) a known academic publisher, looks like a PubMed link,
    or carries a PDF hint in its path or query string.

    Args:
        url: The URL to classify. Falsy values return False.

    Returns:
        True when the URL looks downloadable; False otherwise, including
        when parsing fails (the failure is logged).
    """
    try:
        if not url:
            return False

        # Lowercase once and reuse: the original computed url.lower()
        # twice (once for parsing, once for the ".pdf?" check).
        lowered = url.lower()
        parsed = urlparse(lowered)
        hostname = parsed.hostname or ""
        path = parsed.path or ""
        query = parsed.query or ""

        # Check for direct PDF files
        if path.endswith(".pdf") or ".pdf?" in lowered:
            return True

        # Known downloadable academic domains. frozenset gives an O(1)
        # exact-hostname check and avoids rebuilding a list on each call.
        downloadable_domains = frozenset(
            {
                "arxiv.org",
                "biorxiv.org",
                "medrxiv.org",
                "ncbi.nlm.nih.gov",
                "pubmed.ncbi.nlm.nih.gov",
                "europepmc.org",
                "semanticscholar.org",
                "researchgate.net",
                "academia.edu",
                "sciencedirect.com",
                "springer.com",
                "nature.com",
                "wiley.com",
                "ieee.org",
                "acm.org",
                "plos.org",
                "frontiersin.org",
                "mdpi.com",
                "acs.org",
                "rsc.org",
                "tandfonline.com",
                "sagepub.com",
                "oxford.com",
                "cambridge.org",
                "bmj.com",
                "nejm.org",
                "thelancet.com",
                "jamanetwork.com",
                "annals.org",
                "ahajournals.org",
                "cell.com",
                "science.org",
                "pnas.org",
                "elifesciences.org",
                "embopress.org",
                "journals.asm.org",
                "microbiologyresearch.org",
                "jvi.asm.org",
                "genome.cshlp.org",
                "genetics.org",
                "g3journal.org",
                "plantphysiol.org",
                "plantcell.org",
                "aspb.org",
                "bioone.org",
                "company-of-biologists.org",
                "biologists.org",
                "jeb.biologists.org",
                "dmm.biologists.org",
                "bio.biologists.org",
                "doi.org",
                "ssrn.com",
                "openreview.net",
            }
        )

        # Exact hostname match is O(1); subdomain match needs a suffix scan.
        if hostname in downloadable_domains:
            return True
        if any(
            hostname.endswith("." + domain)
            for domain in downloadable_domains
        ):
            return True

        # Special case for PubMed which might appear in hostname or path
        if "pubmed" in hostname or "/pubmed/" in path:
            return True

        # Check for PDF in path or query parameters
        if "/pdf/" in path or "type=pdf" in query or "format=pdf" in query:
            return True

        return False

    except Exception as e:
        logger.warning(f"Error parsing URL {url}: {e}")
        return False

150 

151 

def get_authenticated_user_password(
    username: str, flask_session_id: str = None
) -> str:
    """
    Get authenticated user password from session store with fallback to g.user_password.

    Args:
        username: The username to get password for
        flask_session_id: Optional Flask session ID. If not provided, uses
            session.get("session_id")

    Returns:
        str: The user's password

    Raises:
        AuthenticationRequiredError: If no password is available for the user
    """
    from ...database.session_passwords import session_password_store
    from ...web.exceptions import AuthenticationRequiredError

    session_id = flask_session_id or session.get("session_id")

    # Primary source: the in-memory session password store.
    try:
        stored = session_password_store.get_session_password(
            username, session_id
        )
    except Exception:
        logger.exception("Failed to get user password from session store")
        stored = None
    if stored:
        logger.debug(
            f"Retrieved user password from session store for user {username}"
        )
        return stored

    # Secondary source: g.user_password (set by middleware if temp_auth was used)
    fallback = getattr(g, "user_password", None)
    if fallback:
        logger.debug(
            f"Retrieved user password from g.user_password fallback for user {username}"
        )
        return fallback

    # Neither source produced a password — the caller must re-authenticate.
    logger.error(f"No user password available for user {username}")
    raise AuthenticationRequiredError(
        message="Authentication required: Please refresh the page and log in again to access encrypted database features.",
        username=username,
    )

200 

201 

202# ============= Page Routes ============= 

203 

204 

@library_bp.route("/")
@login_required
def library_page():
    """Main library page showing downloaded documents.

    Renders the library template with aggregate stats, the (optionally
    filtered) document list, and the dropdown data for the domain /
    research / collection filters.
    """
    username = session.get("username")
    service = LibraryService(username)

    # Library settings. get_settings_manager is already imported at module
    # level; the previous function-local re-import was redundant.
    settings = get_settings_manager()
    pdf_storage_mode = settings.get_setting(
        "research_library.pdf_storage_mode", "database"
    )
    # Enable PDF storage button if mode is not "none"
    enable_pdf_storage = pdf_storage_mode != "none"
    shared_library = settings.get_setting(
        "research_library.shared_library", False
    )

    # Get statistics
    stats = service.get_library_stats()

    # Get documents with optional filters from the query string
    domain_filter = request.args.get("domain")
    research_filter = request.args.get("research")
    collection_filter = request.args.get("collection")  # New collection filter

    documents = service.get_documents(
        research_id=research_filter,
        domain=domain_filter,
        collection_id=collection_filter,
        limit=100,
    )

    # Dropdown data for the filter controls
    unique_domains = service.get_unique_domains()
    research_list = service.get_research_list_with_stats()
    collections = service.get_all_collections()

    return render_template_with_defaults(
        "pages/library.html",
        stats=stats,
        documents=documents,
        unique_domains=unique_domains,
        research_list=research_list,
        collections=collections,
        selected_collection=collection_filter,
        storage_path=stats.get("storage_path", ""),
        enable_pdf_storage=enable_pdf_storage,
        pdf_storage_mode=pdf_storage_mode,
        shared_library=shared_library,
    )

262 

263 

@library_bp.route("/document/<string:document_id>")
@login_required
def document_details_page(document_id):
    """Document details page showing all metadata and links."""
    service = LibraryService(session.get("username"))

    # Look the document up; a missing document yields a plain-text 404
    # like the other page routes in this module.
    document = service.get_document_by_id(document_id)
    if not document:
        return "Document not found", 404

    return render_template_with_defaults(
        "pages/document_details.html", document=document
    )

280 

281 

@library_bp.route("/download-manager")
@login_required
def download_manager_page():
    """Download manager page for selecting and downloading research PDFs.

    Builds per-research summary statistics plus a per-domain breakdown of
    PDF sources for each research session.
    """
    username = session.get("username")
    service = LibraryService(username)

    # Library settings. get_settings_manager is already imported at module
    # level; the previous function-local re-import was redundant.
    settings = get_settings_manager()
    pdf_storage_mode = settings.get_setting(
        "research_library.pdf_storage_mode", "database"
    )
    # Enable PDF storage button if mode is not "none"
    enable_pdf_storage = pdf_storage_mode != "none"
    shared_library = settings.get_setting(
        "research_library.shared_library", False
    )

    # Get research sessions with statistics
    research_list = service.get_research_list_with_stats()

    # Calculate summary statistics
    total_researches = len(research_list)
    total_resources = sum(r["total_resources"] for r in research_list)
    already_downloaded = sum(r["downloaded_count"] for r in research_list)
    available_to_download = (
        sum(r["downloadable_count"] for r in research_list) - already_downloaded
    )

    # Enrich research data with domain breakdowns
    for research in research_list:
        # Get PDF sources for this research
        documents = service.get_documents(
            research_id=research["id"], file_type="pdf"
        )
        research["pdf_sources"] = documents[:10]  # Preview first 10

        # Per-domain counters: total docs, PDFs, completed downloads
        domains = {}
        for doc in documents:
            domain = doc.get("domain", "unknown")
            if domain not in domains:
                domains[domain] = {"total": 0, "pdfs": 0, "downloaded": 0}
            domains[domain]["total"] += 1
            if doc["file_type"] == "pdf":
                domains[domain]["pdfs"] += 1
            if doc["download_status"] == "completed":
                domains[domain]["downloaded"] += 1

        research["domains"] = domains

    return render_template_with_defaults(
        "pages/download_manager.html",
        research_list=research_list,
        total_researches=total_researches,
        total_resources=total_resources,
        already_downloaded=already_downloaded,
        available_to_download=available_to_download,
        enable_pdf_storage=enable_pdf_storage,
        pdf_storage_mode=pdf_storage_mode,
        shared_library=shared_library,
    )

346 

347 

348# ============= API Routes ============= 

349 

350 

@library_bp.route("/api/stats")
@login_required
def get_library_stats():
    """Get library statistics."""
    # Thin JSON wrapper over LibraryService.get_library_stats().
    stats = LibraryService(session.get("username")).get_library_stats()
    return jsonify(stats)

359 

360 

@library_bp.route("/api/collections/list")
@login_required
def get_collections_list():
    """Get list of all collections for dropdown selection."""
    username = session.get("username")

    with get_user_db_session(username) as db_session:
        # Alphabetical order so the dropdown is stable and scannable.
        rows = db_session.query(Collection).order_by(Collection.name).all()

        payload = [
            {
                "id": row.id,
                "name": row.name,
                "description": row.description,
            }
            for row in rows
        ]

        return jsonify({"success": True, "collections": payload})

385 

386 

@library_bp.route("/api/documents")
@login_required
def get_documents():
    """Get documents with filtering.

    Query parameters:
        research_id, domain, file_type, search: optional filters.
        favorites: "true" restricts to favorited documents.
        limit, offset: pagination integers (defaults 100 / 0).

    Returns:
        JSON {"documents": [...]}, or HTTP 400 when limit/offset are not
        valid integers.
    """
    username = session.get("username")
    service = LibraryService(username)

    # Get filter parameters
    research_id = request.args.get("research_id")
    domain = request.args.get("domain")
    file_type = request.args.get("file_type")
    favorites_only = request.args.get("favorites") == "true"
    search_query = request.args.get("search")

    # Validate pagination explicitly: previously a non-numeric limit/offset
    # raised an unhandled ValueError and surfaced as an HTTP 500.
    try:
        limit = int(request.args.get("limit", 100))
        offset = int(request.args.get("offset", 0))
    except (TypeError, ValueError):
        return jsonify({"error": "limit and offset must be integers"}), 400

    documents = service.get_documents(
        research_id=research_id,
        domain=domain,
        file_type=file_type,
        favorites_only=favorites_only,
        search_query=search_query,
        limit=limit,
        offset=offset,
    )

    return jsonify({"documents": documents})

414 

415 

@library_bp.route(
    "/api/document/<string:document_id>/favorite", methods=["POST"]
)
@login_required
def toggle_favorite(document_id):
    """Toggle favorite status of a document."""
    service = LibraryService(session.get("username"))
    # The service returns the new favorite state after toggling.
    new_state = service.toggle_favorite(document_id)
    return jsonify({"favorite": new_state})

426 

427 

@library_bp.route("/api/document/<string:document_id>", methods=["DELETE"])
@login_required
def delete_document(document_id):
    """Delete a document from library."""
    service = LibraryService(session.get("username"))
    deleted = service.delete_document(document_id)
    return jsonify({"success": deleted})

436 

437 

@library_bp.route("/api/document/<string:document_id>/pdf-url")
@login_required
def get_pdf_url(document_id):
    """Get URL for viewing PDF."""
    # Build the URL of the PDF-serving endpoint; no DB lookup is done
    # here, so the title is a static placeholder.
    pdf_url = f"/library/api/document/{document_id}/pdf"
    return jsonify(
        {
            "url": pdf_url,
            "title": "Document",  # Could fetch actual title
        }
    )

449 

450 

@library_bp.route("/document/<string:document_id>/pdf")
@login_required
def view_pdf_page(document_id):
    """Page for viewing PDF file - uses PDFStorageManager for retrieval.

    Looks the document up in the per-user database, then delegates to
    PDFStorageManager which handles both database- and filesystem-backed
    PDF storage. Returns the PDF bytes inline (not as an attachment), or
    a plain-text 404 when the document or its PDF is missing.
    """
    username = session.get("username")

    with get_user_db_session(username) as db_session:
        # Get document from database
        document = db_session.query(Document).filter_by(id=document_id).first()

        if not document:
            logger.warning(
                f"Document ID {document_id} not found in database for user {username}"
            )
            return "Document not found", 404

        logger.info(
            f"Document {document_id}: title='{document.title}', "
            f"file_path={document.file_path}"
        )

        # Storage settings decide where the PDF bytes live; note the
        # default mode here is "none" (stricter than library_page's
        # "database" default).
        settings = get_settings_manager(db_session)
        storage_mode = settings.get_setting(
            "research_library.pdf_storage_mode", "none"
        )
        library_root = Path(
            settings.get_setting(
                "research_library.storage_path",
                str(get_library_directory()),
            )
        ).expanduser()

        # Use PDFStorageManager to load PDF (handles database and filesystem)
        pdf_manager = PDFStorageManager(library_root, storage_mode)
        pdf_bytes = pdf_manager.load_pdf(document, db_session)

        if pdf_bytes:
            logger.info(
                f"Serving PDF for document {document_id} ({len(pdf_bytes)} bytes)"
            )
            # Serve from memory; as_attachment=False renders inline in the
            # browser's PDF viewer.
            return send_file(
                BytesIO(pdf_bytes),
                mimetype="application/pdf",
                as_attachment=False,
                download_name=document.filename or "document.pdf",
            )

        # No PDF found anywhere
        logger.warning(f"No PDF available for document {document_id}")
        return "PDF not available", 404

502 

503 

@library_bp.route("/api/document/<string:document_id>/pdf")
@login_required
def serve_pdf_api(document_id):
    """API endpoint for serving PDF file (kept for backward compatibility)."""
    # Delegates entirely to the page route handler.
    response = view_pdf_page(document_id)
    return response

509 

510 

@library_bp.route("/document/<string:document_id>/txt")
@login_required
def view_text_page(document_id):
    """Page for viewing text content."""
    username = session.get("username")

    with get_user_db_session(username) as db:
        # Extracted text is stored directly on the Document row.
        document = db.query(Document).filter_by(id=document_id).first()

        if not document:
            logger.warning(f"Document not found for document ID {document_id}")
            return "Document not found", 404

        if not document.text_content:
            logger.warning(f"Document {document_id} has no text content")
            return "Text content not available", 404

        logger.info(
            f"Serving text content for document {document_id}: {len(document.text_content)} characters"
        )

        # Render the text inside an HTML wrapper page.
        return render_template_with_defaults(
            "pages/document_text.html",
            document_id=document_id,
            title=document.title or "Document Text",
            text_content=document.text_content,
            extraction_method=document.extraction_method,
            word_count=document.word_count,
        )

542 

543 

@library_bp.route("/api/document/<string:document_id>/text")
@login_required
def serve_text_api(document_id):
    """API endpoint for serving text content (kept for backward compatibility)."""
    username = session.get("username")

    with get_user_db_session(username) as db:
        # Extracted text is stored directly on the Document row.
        document = db.query(Document).filter_by(id=document_id).first()

        if not document:
            logger.warning(f"Document not found for document ID {document_id}")
            return jsonify({"error": "Document not found"}), 404

        if not document.text_content:
            logger.warning(f"Document {document_id} has no text content")
            return jsonify({"error": "Text content not available"}), 404

        logger.info(
            f"Serving text content for document {document_id}: {len(document.text_content)} characters"
        )

        payload = {
            "text_content": document.text_content,
            "title": document.title or "Document",
            "extraction_method": document.extraction_method,
            "word_count": document.word_count,
        }
        return jsonify(payload)

574 

575 

@library_bp.route("/api/open-folder", methods=["POST"])
@login_required
def open_folder():
    """Open folder containing a document.

    Security: This endpoint is disabled for server deployments.
    It only makes sense for desktop usage where the server and client are on the same machine.
    """
    # Always refuse: returning 403 keeps the endpoint discoverable for the
    # desktop build without exposing filesystem access on servers.
    payload = {
        "status": "error",
        "message": "This feature is disabled. It is only available in desktop mode.",
    }
    return jsonify(payload), 403

590 

591 

@library_bp.route("/api/download/<int:resource_id>", methods=["POST"])
@login_required
def download_single_resource(resource_id):
    """Download a single resource."""
    username = session.get("username")
    user_password = get_authenticated_user_password(username)

    with DownloadService(username, user_password) as service:
        success, error = service.download_resource(resource_id)

        if success:
            return jsonify({"success": True})

        # Log the internal detail but return a generic client-facing message.
        logger.warning(f"Download failed for resource {resource_id}: {error}")
        return jsonify(
            {
                "success": False,
                "error": "Download failed. Please try again or contact support.",
            }
        ), 500

613 

614 

@library_bp.route("/api/download-text/<int:resource_id>", methods=["POST"])
@login_required
def download_text_single(resource_id):
    """Download a single resource as text file."""
    try:
        username = session.get("username")
        user_password = get_authenticated_user_password(username)

        with DownloadService(username, user_password) as service:
            success, error = service.download_as_text(resource_id)

        if success:
            return jsonify({"success": True, "error": None})

        # Sanitize error message - don't expose internal details
        if error:
            logger.warning(
                f"Download as text failed for resource {resource_id}: {error}"
            )
        # NOTE(review): failures here respond with HTTP 200, unlike the PDF
        # endpoint above (500) — confirm the frontend depends on this before
        # changing it.
        return jsonify(
            {"success": False, "error": "Failed to download resource"}
        )
    except Exception as e:
        return handle_api_error(
            f"downloading resource {resource_id} as text", e
        )

641 

642 

@library_bp.route("/api/download-all-text", methods=["POST"])
@login_required
def download_all_text():
    """Download all undownloaded resources as text files.

    Streams progress to the client as Server-Sent Events: one JSON
    ``data:`` event per processed resource, then a final
    ``{"complete": true}`` event.
    """
    username = session.get("username")
    # Capture Flask session ID to avoid scoping issues in nested function
    flask_session_id = session.get("session_id")

    def generate():
        # Get user password for database operations
        from ...web.exceptions import AuthenticationRequiredError

        try:
            user_password = get_authenticated_user_password(
                username, flask_session_id
            )
        except AuthenticationRequiredError:
            # Without credentials the encrypted DB cannot be opened; end
            # the SSE stream silently (the client sees it close).
            logger.warning(f"Authentication expired for user {username}")
            return

        download_service = DownloadService(username, user_password)
        try:
            # Get all undownloaded resources
            # NOTE(review): `session` here shadows the module-level Flask
            # `session` import for the rest of this generator; the Flask
            # session is not needed past this point, but the shadowing is
            # fragile.
            with get_user_db_session(username) as session:
                # Get resources that don't have text files yet
                resources = session.query(ResearchResource).all()

                # Filter resources that need text extraction
                txt_path = Path(download_service.library_root) / "txt"
                resources_to_process = []

                # Pre-scan directory once to get all existing resource IDs
                # (avoids a per-resource filesystem glob).
                existing_resource_ids = set()
                if txt_path.exists():
                    for txt_file in txt_path.glob("*.txt"):
                        # Extract resource ID from filename pattern *_{id}.txt
                        parts = txt_file.stem.rsplit("_", 1)
                        if len(parts) == 2:
                            try:
                                existing_resource_ids.add(int(parts[1]))
                            except ValueError:
                                pass

                for resource in resources:
                    # Check if text file already exists using preloaded set
                    if resource.id not in existing_resource_ids:
                        resources_to_process.append(resource)

                total = len(resources_to_process)
                current = 0

                logger.info(f"Found {total} resources needing text extraction")

                for resource in resources_to_process:
                    current += 1
                    progress = (
                        int((current / total) * 100) if total > 0 else 100
                    )

                    # Truncated title used only for UI display.
                    file_name = (
                        resource.title[:50]
                        if resource
                        else f"document_{current}.txt"
                    )

                    try:
                        success, error = download_service.download_as_text(
                            resource.id
                        )

                        if success:
                            status = "success"
                            error_msg = None
                        else:
                            status = "failed"
                            error_msg = error or "Text extraction failed"

                    except Exception as e:
                        # One failing resource must not abort the stream;
                        # report the failure and continue.
                        logger.exception(
                            f"Error extracting text for resource {resource.id}"
                        )
                        status = "failed"
                        error_msg = (
                            f"Text extraction failed - {type(e).__name__}"
                        )

                    # Send update
                    update = {
                        "progress": progress,
                        "current": current,
                        "total": total,
                        "file": file_name,
                        "url": resource.url,  # Add the URL for UI display
                        "status": status,
                        "error": error_msg,
                    }
                    yield f"data: {json.dumps(update)}\n\n"

            # Send completion
            yield f"data: {json.dumps({'complete': True, 'total': total})}\n\n"
        finally:
            download_service.close()

    return Response(
        stream_with_context(generate()), mimetype="text/event-stream"
    )

749 

750 

@library_bp.route("/api/download-research/<research_id>", methods=["POST"])
@login_required
def download_research_pdfs(research_id):
    """Queue all PDFs from a research session for download."""
    username = session.get("username")
    user_password = get_authenticated_user_password(username)

    # Optional target collection supplied in the JSON body.
    payload = request.json or {}
    collection_id = payload.get("collection_id")

    with DownloadService(username, user_password) as service:
        queued = service.queue_research_downloads(research_id, collection_id)

        # Queue processing happens elsewhere; this endpoint only enqueues.
        # TODO: Integrate with existing queue processor
        return jsonify({"success": True, "queued": queued})

770 

771 

@library_bp.route("/api/download-bulk", methods=["POST"])
@login_required
def download_bulk():
    """Download PDFs or extract text from multiple research sessions.

    Request JSON:
        research_ids: list of research session IDs (required).
        mode: "pdf" (default) or "text_only".
        collection_id: optional target collection for downloads.

    Streams progress as Server-Sent Events; each event carries progress
    counters, the current file name, a status of success/skipped/failed,
    and an optional human-readable error/skip reason.
    """
    username = session.get("username")
    data = request.json
    research_ids = data.get("research_ids", [])
    mode = data.get("mode", "pdf")  # pdf or text_only
    collection_id = data.get(
        "collection_id"
    )  # Optional: target collection for downloads

    if not research_ids:
        return jsonify({"error": "No research IDs provided"}), 400

    # Capture Flask session ID to avoid scoping issues in nested function
    flask_session_id = session.get("session_id")

    def generate():
        """Generate progress updates as Server-Sent Events."""
        # Get user password for database operations
        from ...web.exceptions import AuthenticationRequiredError

        try:
            user_password = get_authenticated_user_password(
                username, flask_session_id
            )
        except AuthenticationRequiredError:
            # No credentials: end the stream without emitting events.
            return

        download_service = DownloadService(username, user_password)
        try:
            # Count total pending queue items across all research IDs
            total = 0
            current = 0

            # NOTE: `session` below shadows the Flask `session` import for
            # the remainder of this generator.
            with get_user_db_session(username) as session:
                for research_id in research_ids:
                    count = (
                        session.query(LibraryDownloadQueue)
                        .filter_by(
                            research_id=research_id,
                            status=DocumentStatus.PENDING,
                        )
                        .count()
                    )
                    total += count
                    logger.debug(
                        f"[PROGRESS_DEBUG] Research {research_id}: {count} pending items in queue"
                    )

            logger.info(
                f"[PROGRESS_DEBUG] Total pending downloads across all research: {total}"
            )
            yield f"data: {json.dumps({'progress': 0, 'current': 0, 'total': total})}\n\n"

            # Process each research
            for research_id in research_ids:
                # Get queued downloads for this research
                with get_user_db_session(username) as session:
                    # Get pending queue items for this research
                    queue_items = (
                        session.query(LibraryDownloadQueue)
                        .filter_by(
                            research_id=research_id,
                            status=DocumentStatus.PENDING,
                        )
                        .all()
                    )

                    # If no items queued yet, queue them now
                    if not queue_items:
                        try:
                            download_service.queue_research_downloads(
                                research_id, collection_id
                            )
                            # Re-fetch queue items. Fixed: this query used a
                            # raw status="pending" string while every other
                            # queue query uses the DocumentStatus enum —
                            # use the enum consistently.
                            queue_items = (
                                session.query(LibraryDownloadQueue)
                                .filter_by(
                                    research_id=research_id,
                                    status=DocumentStatus.PENDING,
                                )
                                .all()
                            )
                        except Exception:
                            logger.exception(
                                f"Error queueing downloads for research {research_id}"
                            )
                            # Continue with empty queue_items
                            queue_items = []

                    # Process each queued item
                    for queue_item in queue_items:
                        logger.debug(
                            f"[PROGRESS_DEBUG] Before increment: current={current} (type: {type(current)}), total={total} (type: {type(total)})"
                        )
                        current += 1
                        logger.debug(
                            f"[PROGRESS_DEBUG] After increment: current={current} (type: {type(current)})"
                        )

                        # Check for division issues
                        if total is None:
                            logger.error(
                                "[PROGRESS_DEBUG] ERROR: total is None! Setting to 0 to avoid crash"
                            )
                            total = 0

                        progress = (
                            int((current / total) * 100) if total > 0 else 100
                        )
                        logger.debug(
                            f"[PROGRESS_DEBUG] Calculated progress: {progress}%"
                        )

                        # Get resource info (title for UI display only)
                        resource = session.query(ResearchResource).get(
                            queue_item.resource_id
                        )
                        file_name = (
                            resource.title[:50]
                            if resource
                            else f"document_{current}.pdf"
                        )

                        # Attempt actual download with error handling
                        skip_reason = None
                        status = "skipped"  # Default to skipped
                        success = False
                        error_msg = None

                        try:
                            logger.debug(
                                f"Attempting {'PDF download' if mode == 'pdf' else 'text extraction'} for resource {queue_item.resource_id}"
                            )

                            # Call appropriate service method based on mode
                            if mode == "pdf":
                                result = download_service.download_resource(
                                    queue_item.resource_id
                                )
                            else:  # text_only
                                result = download_service.download_as_text(
                                    queue_item.resource_id
                                )

                            # Handle new tuple return format
                            if isinstance(result, tuple):
                                success, skip_reason = result
                            else:
                                success = result
                                skip_reason = None

                            status = "success" if success else "skipped"
                            if skip_reason and not success:
                                error_msg = skip_reason
                                logger.info(
                                    f"{'Download' if mode == 'pdf' else 'Text extraction'} skipped for resource {queue_item.resource_id}: {skip_reason}"
                                )

                            logger.debug(
                                f"{'Download' if mode == 'pdf' else 'Text extraction'} result: success={success}, status={status}, skip_reason={skip_reason}"
                            )
                        except Exception as e:
                            # Log error but continue processing
                            error_msg = str(e)
                            error_type = type(e).__name__
                            logger.info(
                                f"CAUGHT Download exception for resource {queue_item.resource_id}: {error_type}: {error_msg}"
                            )
                            # Check if this is a skip reason (not a real error)
                            # Use error category + categorized message for user display
                            if any(
                                phrase in error_msg.lower()
                                for phrase in [
                                    "paywall",
                                    "subscription",
                                    "not available",
                                    "not found",
                                    "no free",
                                    "embargoed",
                                    "forbidden",
                                    "not accessible",
                                ]
                            ):
                                status = "skipped"
                                skip_reason = f"Document not accessible (paywall or access restriction) - {error_type}"
                            elif any(
                                phrase in error_msg.lower()
                                for phrase in [
                                    "failed to download",
                                    "could not",
                                    "invalid",
                                    "server",
                                ]
                            ):
                                status = "failed"
                                skip_reason = f"Download failed - {error_type}"
                            else:
                                status = "failed"
                                skip_reason = (
                                    f"Processing failed - {error_type}"
                                )
                            success = False

                        # Ensure skip_reason is set if we have an error message
                        if error_msg and not skip_reason:
                            skip_reason = f"Processing failed - {error_type}"
                            logger.debug(
                                f"Setting skip_reason from error_msg: {error_msg}"
                            )

                        # Send progress update
                        update_data = {
                            "progress": progress,
                            "current": current,
                            "total": total,
                            "file": file_name,
                            "status": status,
                        }
                        # Add skip reason if available
                        if skip_reason:
                            update_data["error"] = skip_reason
                            logger.info(
                                f"Sending skip reason to UI: {skip_reason}"
                            )

                        logger.info(f"Update data being sent: {update_data}")
                        yield f"data: {json.dumps(update_data)}\n\n"

            yield f"data: {json.dumps({'progress': 100, 'current': total, 'total': total, 'complete': True})}\n\n"
        finally:
            download_service.close()

    return Response(
        stream_with_context(generate()),
        mimetype="text/event-stream",
        headers={"Cache-Control": "no-cache", "X-Accel-Buffering": "no"},
    )

1011 

1012 

@library_bp.route("/api/research-list")
@login_required
def get_research_list():
    """Return the current user's research sessions with download statistics.

    Response JSON: {"research": [...]}, where each entry is produced by
    LibraryService.get_research_list_with_stats().
    """
    library = LibraryService(session.get("username"))
    return jsonify({"research": library.get_research_list_with_stats()})

1021 

1022 

@library_bp.route("/api/sync-library", methods=["POST"])
@login_required
def sync_library():
    """Reconcile the library database with the files on disk.

    Delegates to LibraryService.sync_library_with_filesystem() and returns
    its statistics dict as JSON.
    """
    library = LibraryService(session.get("username"))
    return jsonify(library.sync_library_with_filesystem())

1031 

1032 

@library_bp.route("/api/mark-redownload", methods=["POST"])
@login_required
def mark_for_redownload():
    """Mark a batch of documents for re-download.

    Expects a JSON body: {"document_ids": [<id>, ...]}.

    Returns:
        200 with {"success": True, "marked": <count>} on success,
        400 when the body is missing or contains no document IDs.
    """
    username = session.get("username")
    service = LibraryService(username)

    # request.json is None when the body is absent or the JSON literal
    # `null`; fall back to an empty dict so .get() below cannot raise
    # AttributeError.
    data = request.json or {}
    document_ids = data.get("document_ids", [])

    if not document_ids:
        return jsonify({"error": "No document IDs provided"}), 400

    count = service.mark_for_redownload(document_ids)
    return jsonify({"success": True, "marked": count})

1048 

1049 

@library_bp.route("/api/queue-all-undownloaded", methods=["POST"])
@login_required
def queue_all_undownloaded():
    """Queue every research resource that has no completed download.

    Candidates are narrowed by the smart retry filter (ResourceFilter) and
    a downloadable-domain check.  Existing queue rows are re-armed to
    PENDING rather than duplicated.  Actual downloading happens elsewhere
    (SSE stream / background tasks); this endpoint only queues.

    Returns:
        JSON with queued/skipped counts, affected research IDs, and the
        filter summary details.
    """
    username = session.get("username")

    logger.info(f"queue_all_undownloaded called for user {username}")

    with get_user_db_session(username) as db_session:
        # "Undownloaded" == no completed Document row points at the
        # resource (anti-join: LEFT OUTER JOIN + IS NULL).
        undownloaded = (
            db_session.query(ResearchResource)
            .outerjoin(
                Document,
                (ResearchResource.id == Document.resource_id)
                & (Document.status == "completed"),
            )
            .filter(Document.id.is_(None))
            .all()
        )

        logger.info(f"Found {len(undownloaded)} total undownloaded resources")

        # The per-user database is encrypted; the filter needs the password.
        user_password = get_authenticated_user_password(username)

        resource_filter = ResourceFilter(username, user_password)
        filter_results = resource_filter.filter_downloadable_resources(
            undownloaded
        )
        filter_summary = resource_filter.get_filter_summary(undownloaded)
        skipped_info = resource_filter.get_skipped_resources_info(undownloaded)

        logger.info(f"Filter results: {filter_summary.to_dict()}")

        # Index results once so the loop below is O(1) per resource.
        verdict_by_id = {r.resource_id: r for r in filter_results}

        queued_count = 0
        skipped_count = 0
        research_ids = set()

        for resource in undownloaded:
            verdict = verdict_by_id.get(resource.id)

            # Respect the retry policy; also skip anything the filter
            # never produced a verdict for.
            if not verdict or not verdict.can_retry:
                skipped_count += 1
                if verdict:
                    logger.debug(
                        f"Skipping resource {resource.id} due to retry policy: {verdict.reason}"
                    )
                else:
                    logger.debug(
                        f"Skipping resource {resource.id} - no filter result available"
                    )
                continue

            # Nothing to fetch without a URL.
            if not resource.url:
                skipped_count += 1
                continue

            # Visibility aid while debugging PubMed handling.
            if resource.url and "pubmed" in resource.url.lower():
                logger.info(f"Found PubMed URL: {resource.url[:100]}")

            # Proper URL/domain check rather than substring matching.
            if not is_downloadable_domain(resource.url):
                skipped_count += 1
                logger.debug(
                    f"Skipping non-downloadable URL: {resource.url[:100] if resource.url else 'None'}"
                )
                continue

            # One queue row per resource regardless of status.
            existing_queue = (
                db_session.query(LibraryDownloadQueue)
                .filter_by(resource_id=resource.id)
                .first()
            )

            if existing_queue is None:
                # First encounter: create a fresh pending entry.
                db_session.add(
                    LibraryDownloadQueue(
                        resource_id=resource.id,
                        research_id=resource.research_id,
                        priority=0,
                        status=DocumentStatus.PENDING,
                    )
                )
                logger.debug(
                    f"Added new queue entry for resource {resource.id}"
                )
            elif existing_queue.status != DocumentStatus.PENDING:
                # Re-arm a finished/failed entry.
                existing_queue.status = DocumentStatus.PENDING
                existing_queue.completed_at = None
                logger.debug(
                    f"Reset queue entry for resource {resource.id} to pending"
                )
            else:
                # Already waiting; still counts toward the total.
                logger.debug(
                    f"Resource {resource.id} already pending in queue"
                )

            # All three branches above count as "queued".
            queued_count += 1
            research_ids.add(resource.research_id)

        db_session.commit()

        logger.info(
            f"Queued {queued_count} articles for download, skipped {skipped_count} resources (including {filter_summary.permanently_failed_count} permanently failed and {filter_summary.temporarily_failed_count} temporarily failed)"
        )

        # Note: Removed synchronous download processing here to avoid blocking the HTTP request
        # Downloads will be processed via the SSE streaming endpoint or background tasks

        return jsonify(
            {
                "success": True,
                "queued": queued_count,
                "research_ids": list(research_ids),
                "total_undownloaded": len(undownloaded),
                "skipped": skipped_count,
                "filter_summary": filter_summary.to_dict(),
                "skipped_details": skipped_info,
            }
        )

1187 

1188 

@library_bp.route("/api/get-research-sources/<research_id>", methods=["GET"])
@login_required
def get_research_sources(research_id):
    """Get all sources for a research with snippets.

    Args:
        research_id: ID of the research whose resources are listed.

    Returns:
        JSON {"success": True, "sources": [...], "total": <int>}.  Each
        source includes URL, title, snippet, display domain and — when a
        completed download exists — the document id, file type and date.
    """
    username = session.get("username")

    sources = []
    with get_user_db_session(username) as db_session:
        # All resources for this research, in creation order.
        resources = (
            db_session.query(ResearchResource)
            .filter_by(research_id=research_id)
            .order_by(ResearchResource.created_at)
            .all()
        )

        for idx, resource in enumerate(resources, 1):
            # Check whether a document exists for this resource.
            document = (
                db_session.query(Document)
                .filter_by(resource_id=resource.id)
                .first()
            )

            # Derive the display domain from the URL.  urlparse is already
            # imported at module level — the previous per-iteration
            # `from urllib.parse import urlparse` was redundant.
            domain = ""
            if resource.url:
                try:
                    domain = urlparse(resource.url).hostname or ""
                except (ValueError, AttributeError):
                    # urlparse can raise ValueError for malformed URLs
                    pass

            source_data = {
                "number": idx,
                "resource_id": resource.id,
                "url": resource.url,
                "title": resource.title or f"Source {idx}",
                "snippet": resource.content_preview or "",
                "domain": domain,
                "relevance_score": getattr(resource, "relevance_score", None),
                "downloaded": False,
                "document_id": None,
                "file_type": None,
            }

            if document and document.status == "completed":
                source_data.update(
                    {
                        "downloaded": True,
                        "document_id": document.id,
                        "file_type": document.file_type,
                        "download_date": document.created_at.isoformat()
                        if document.created_at
                        else None,
                    }
                )

            sources.append(source_data)

    return jsonify({"success": True, "sources": sources, "total": len(sources)})

1252 

1253 

@library_bp.route("/api/check-downloads", methods=["POST"])
@login_required
def check_downloads():
    """Check download status for a list of URLs.

    Expects a JSON body: {"research_id": <id>, "urls": [<url>, ...]}.

    Returns:
        JSON {"download_status": {url: {...}}} for every matching
        resource; 400 when research_id or urls is missing.
    """
    username = session.get("username")
    # Guard against an absent/null JSON body so .get() below cannot raise
    # AttributeError.
    data = request.json or {}
    research_id = data.get("research_id")
    urls = data.get("urls", [])

    if not research_id or not urls:
        return jsonify({"error": "Missing research_id or urls"}), 400

    download_status = {}

    with get_user_db_session(username) as db_session:
        # Restrict to this research and to the URLs the caller asked about.
        resources = (
            db_session.query(ResearchResource)
            .filter_by(research_id=research_id)
            .filter(ResearchResource.url.in_(urls))
            .all()
        )

        for resource in resources:
            # Check whether a document exists for this resource.
            document = (
                db_session.query(Document)
                .filter_by(resource_id=resource.id)
                .first()
            )

            if document and document.status == "completed":
                download_status[resource.url] = {
                    "downloaded": True,
                    "document_id": document.id,
                    "file_path": document.file_path,
                    "file_type": document.file_type,
                    "title": document.title or resource.title,
                }
            else:
                download_status[resource.url] = {
                    "downloaded": False,
                    "resource_id": resource.id,
                }

    return jsonify({"download_status": download_status})

1300 

1301 

@library_bp.route("/api/download-source", methods=["POST"])
@login_required
def download_source():
    """Download a single source from a research.

    Expects a JSON body: {"research_id": <id>, "url": <url>}.

    Queues the resource at elevated priority and runs the download
    synchronously before responding.

    Returns:
        JSON success/failure payload; 400 for a bad request or
        non-downloadable domain, 404 when the resource is unknown.
    """
    username = session.get("username")
    user_password = get_authenticated_user_password(username)
    # Guard against an absent/null JSON body so .get() below cannot raise
    # AttributeError.
    data = request.json or {}
    research_id = data.get("research_id")
    url = data.get("url")

    if not research_id or not url:
        return jsonify({"error": "Missing research_id or url"}), 400

    # Check if URL is downloadable
    if not is_downloadable_domain(url):
        return jsonify({"error": "URL is not from a downloadable domain"}), 400

    with get_user_db_session(username) as db_session:
        # Find the resource
        resource = (
            db_session.query(ResearchResource)
            .filter_by(research_id=research_id, url=url)
            .first()
        )

        if not resource:
            return jsonify({"error": "Resource not found"}), 404

        # Check if already downloaded.  NOTE: Document exposes its state as
        # `.status` everywhere else in this module (queries, joins); the
        # previous `.download_status` lookup was inconsistent with that.
        existing = (
            db_session.query(Document)
            .filter_by(resource_id=resource.id)
            .first()
        )

        if existing and existing.status == "completed":
            return jsonify(
                {
                    "success": True,
                    "message": "Already downloaded",
                    "document_id": existing.id,
                }
            )

        # Add to (or re-arm in) the download queue.
        queue_entry = (
            db_session.query(LibraryDownloadQueue)
            .filter_by(resource_id=resource.id)
            .first()
        )

        if not queue_entry:
            queue_entry = LibraryDownloadQueue(
                resource_id=resource.id,
                research_id=resource.research_id,
                priority=1,  # Higher priority for manual downloads
                status=DocumentStatus.PENDING,
            )
            db_session.add(queue_entry)
        else:
            queue_entry.status = DocumentStatus.PENDING
            queue_entry.priority = 1

        db_session.commit()

        # Start download immediately
        with DownloadService(username, user_password) as service:
            success, message = service.download_resource(resource.id)

            if success:
                return jsonify(
                    {"success": True, "message": "Download completed"}
                )
            else:
                # Log internal message, but show only generic message to user
                logger.warning(
                    f"Download failed for resource {resource.id}: {message}"
                )
                return jsonify({"success": False, "message": "Download failed"})