Coverage for src / local_deep_research / research_library / routes / library_routes.py: 14%

562 statements  

« prev     ^ index     » next       coverage.py v7.12.0, created at 2026-01-11 00:51 +0000

1""" 

2Routes for Research Library and Download Manager 

3 

4Provides web endpoints for: 

5- Library browsing and management 

6- Download manager interface 

7- API endpoints for downloads and queries 

8""" 

9 

10import json 

11from io import BytesIO 

12from pathlib import Path 

13from urllib.parse import urlparse 

14from flask import ( 

15 Blueprint, 

16 g, 

17 jsonify, 

18 request, 

19 session, 

20 Response, 

21 send_file, 

22 stream_with_context, 

23) 

24from loguru import logger 

25 

26from ...web.auth.decorators import login_required 

27from ...web.utils.templates import render_template_with_defaults 

28from ...database.session_context import get_user_db_session 

29from ...database.models.research import ResearchResource 

30from ...database.models.library import ( 

31 Document as Document, 

32 DocumentStatus, 

33 DownloadQueue as LibraryDownloadQueue, 

34 Collection, 

35) 

36from ...library.download_management import ResourceFilter 

37from ..services.download_service import DownloadService 

38from ..services.library_service import LibraryService 

39from ..services.pdf_storage_manager import PDFStorageManager 

40from ..utils import open_file_location, handle_api_error 

41from ...security.path_validator import PathValidator 

42from ...utilities.db_utils import get_settings_manager 

43from ...config.paths import get_library_directory 

44 

# Blueprint that registers every route of this module under the /library prefix
library_bp = Blueprint("library", __name__, url_prefix="/library")

47 

48 

# Hostnames treated as downloadable sources. A URL's hostname matches when it
# equals an entry or is a subdomain of one. Kept at module level so the
# collection is built once instead of on every call.
_DOWNLOADABLE_DOMAINS = (
    "arxiv.org",
    "biorxiv.org",
    "medrxiv.org",
    "ncbi.nlm.nih.gov",
    "pubmed.ncbi.nlm.nih.gov",
    "europepmc.org",
    "semanticscholar.org",
    "researchgate.net",
    "academia.edu",
    "sciencedirect.com",
    "springer.com",
    "nature.com",
    "wiley.com",
    "ieee.org",
    "acm.org",
    "plos.org",
    "frontiersin.org",
    "mdpi.com",
    "acs.org",
    "rsc.org",
    "tandfonline.com",
    "sagepub.com",
    "oxford.com",
    "cambridge.org",
    "bmj.com",
    "nejm.org",
    "thelancet.com",
    "jamanetwork.com",
    "annals.org",
    "ahajournals.org",
    "cell.com",
    "science.org",
    "pnas.org",
    "elifesciences.org",
    "embopress.org",
    "journals.asm.org",
    "microbiologyresearch.org",
    "jvi.asm.org",
    "genome.cshlp.org",
    "genetics.org",
    "g3journal.org",
    "plantphysiol.org",
    "plantcell.org",
    "aspb.org",
    "bioone.org",
    "company-of-biologists.org",
    "biologists.org",
    "jeb.biologists.org",
    "dmm.biologists.org",
    "bio.biologists.org",
    "doi.org",
)


def is_downloadable_domain(url: str) -> bool:
    """Check if URL is from a downloadable academic domain using proper URL parsing.

    The comparison is case-insensitive. A URL qualifies when any of these hold:

    - its path ends in ``.pdf`` or the URL contains ``.pdf?``;
    - its hostname equals, or is a subdomain of, a known domain;
    - ``pubmed`` occurs in the hostname or ``/pubmed/`` in the path;
    - the path contains ``/pdf/`` or the query contains ``type=pdf`` /
      ``format=pdf``.

    Args:
        url: The URL to classify.

    Returns:
        True when the URL looks downloadable; False otherwise, including for
        empty, non-string or unparseable input.
    """
    # Reject empty and non-string input up front rather than relying on an
    # exception from ``.lower()`` further down.
    if not url or not isinstance(url, str):
        return False

    lowered = url.lower()
    try:
        parsed = urlparse(lowered)
    except ValueError as e:
        # urlparse rejects malformed netlocs (e.g. a broken IPv6 literal)
        logger.warning(f"Error parsing URL {url}: {e}")
        return False

    hostname = parsed.hostname or ""
    path = parsed.path or ""
    query = parsed.query or ""

    # Direct PDF links; ".pdf?" also catches a PDF name embedded in the query
    if path.endswith(".pdf") or ".pdf?" in lowered:
        return True

    # Exact domain or subdomain match (the leading dot prevents
    # "evilarxiv.org" from matching "arxiv.org")
    if any(
        hostname == domain or hostname.endswith("." + domain)
        for domain in _DOWNLOADABLE_DOMAINS
    ):
        return True

    # Special case for PubMed which might appear in path
    if "pubmed" in hostname or "/pubmed/" in path:
        return True

    # PDF hinted at by a path segment or a query parameter
    return "/pdf/" in path or "type=pdf" in query or "format=pdf" in query

137 

138 

def get_authenticated_user_password(
    username: str, flask_session_id: str = None
):
    """
    Look up the password of an authenticated user.

    The session password store is asked first. When it returns nothing, or
    raises, ``g.user_password`` is used as a fallback.

    Args:
        username: The username to get the password for
        flask_session_id: Optional session ID. When falsy, the value of
            session.get("session_id") is used instead

    Returns:
        tuple: (password, None) on success, or (None, error_response) where
        error_response is a (JSON response, 401) pair
    """
    from ...database.session_passwords import session_password_store

    lookup_id = flask_session_id or session.get("session_id")

    # Primary source: the session password store. A failure is logged and
    # the lookup falls through to the fallback below.
    try:
        stored = session_password_store.get_session_password(
            username, lookup_id
        )
    except Exception:
        logger.exception("Failed to get user password from session store")
        stored = None

    if stored:
        logger.debug(
            f"Retrieved user password from session store for user {username}"
        )
        return stored, None

    # Secondary source: a password attached to the request-scoped ``g`` object
    fallback = getattr(g, "user_password", None)
    if fallback:
        logger.debug(
            f"Retrieved user password from g.user_password fallback for user {username}"
        )
        return fallback, None

    # Neither source had a password: report it and hand back a 401 payload
    logger.error(f"No user password available for user {username}")
    payload = {
        "status": "error",
        "message": "Authentication required: Please refresh the page and log in again to access encrypted database features.",
    }
    return None, (jsonify(payload), 401)

190 

191 

192# ============= Page Routes ============= 

193 

194 

@library_bp.route("/")
@login_required
def library_page():
    """Render the main library page that lists downloaded documents.

    The optional ``domain``, ``research`` and ``collection`` query parameters
    narrow the document list, which is requested with a limit of 100.
    """
    library_service = LibraryService(session.get("username"))

    # Function-scope import, resolved each time the page is rendered
    from ...utilities.db_utils import get_settings_manager

    # Library settings forwarded to the template
    settings_manager = get_settings_manager()
    storage_mode = settings_manager.get_setting(
        "research_library.pdf_storage_mode", "database"
    )
    is_shared = settings_manager.get_setting(
        "research_library.shared_library", False
    )

    library_stats = library_service.get_library_stats()

    # Filters from the query string; each one may be absent (None)
    query_args = request.args
    selected_domain = query_args.get("domain")
    selected_research = query_args.get("research")
    selected_collection = query_args.get("collection")

    matching_documents = library_service.get_documents(
        research_id=selected_research,
        domain=selected_domain,
        collection_id=selected_collection,
        limit=100,
    )

    # Option lists for the three filter dropdowns
    domain_options = library_service.get_unique_domains()
    research_options = library_service.get_research_list_with_stats()
    collection_options = library_service.get_all_collections()

    template_context = {
        "stats": library_stats,
        "documents": matching_documents,
        "unique_domains": domain_options,
        "research_list": research_options,
        "collections": collection_options,
        "selected_collection": selected_collection,
        "storage_path": library_stats.get("storage_path", ""),
        # The PDF storage button is enabled for every mode except "none"
        "enable_pdf_storage": storage_mode != "none",
        "pdf_storage_mode": storage_mode,
        "shared_library": is_shared,
    }
    return render_template_with_defaults(
        "pages/library.html", **template_context
    )

252 

253 

@library_bp.route("/document/<string:document_id>")
@login_required
def document_details_page(document_id):
    """Render the details page for one document, or 404 when it is unknown."""
    library_service = LibraryService(session.get("username"))
    found = library_service.get_document_by_id(document_id)

    if found:
        return render_template_with_defaults(
            "pages/document_details.html", document=found
        )

    # Falsy lookup result: plain-text 404
    return "Document not found", 404

270 

271 

@library_bp.route("/download-manager")
@login_required
def download_manager_page():
    """Render the download manager page used to pick research PDFs to download.

    Each research entry is enriched in place with ``pdf_sources`` (its first
    ten PDF documents) and ``domains`` (per-domain counters).
    """
    library_service = LibraryService(session.get("username"))

    # Function-scope import, resolved each time the page is rendered
    from ...utilities.db_utils import get_settings_manager

    settings_manager = get_settings_manager()
    storage_mode = settings_manager.get_setting(
        "research_library.pdf_storage_mode", "database"
    )
    is_shared = settings_manager.get_setting(
        "research_library.shared_library", False
    )

    # Research sessions together with their download statistics
    researches = library_service.get_research_list_with_stats()

    # Summary figures shown at the top of the page
    resource_total = sum(entry["total_resources"] for entry in researches)
    downloaded_total = sum(entry["downloaded_count"] for entry in researches)
    downloadable_total = sum(
        entry["downloadable_count"] for entry in researches
    )

    for entry in researches:
        pdf_documents = library_service.get_documents(
            research_id=entry["id"], file_type="pdf"
        )
        # Only the first ten documents are previewed
        entry["pdf_sources"] = pdf_documents[:10]

        # Per-domain counters: total documents, PDFs, completed downloads
        per_domain = {}
        for item in pdf_documents:
            counters = per_domain.setdefault(
                item.get("domain", "unknown"),
                {"total": 0, "pdfs": 0, "downloaded": 0},
            )
            counters["total"] += 1
            if item["file_type"] == "pdf":
                counters["pdfs"] += 1
            if item["download_status"] == "completed":
                counters["downloaded"] += 1

        entry["domains"] = per_domain

    return render_template_with_defaults(
        "pages/download_manager.html",
        research_list=researches,
        total_researches=len(researches),
        total_resources=resource_total,
        already_downloaded=downloaded_total,
        available_to_download=downloadable_total - downloaded_total,
        # The PDF storage button is enabled for every mode except "none"
        enable_pdf_storage=storage_mode != "none",
        pdf_storage_mode=storage_mode,
        shared_library=is_shared,
    )

336 

337 

338# ============= API Routes ============= 

339 

340 

@library_bp.route("/api/stats")
@login_required
def get_library_stats():
    """Return the current user's library statistics as JSON."""
    library_service = LibraryService(session.get("username"))
    return jsonify(library_service.get_library_stats())

349 

350 

@library_bp.route("/api/collections/list")
@login_required
def get_collections_list():
    """Return every collection, ordered by name, for dropdown selection.

    The JSON body has ``success`` and a ``collections`` list whose items carry
    ``id``, ``name`` and ``description``.
    """
    with get_user_db_session(session.get("username")) as db_session:
        ordered = db_session.query(Collection).order_by(Collection.name).all()

        # Serialise while the session is still open
        entries = []
        for collection in ordered:
            entries.append(
                {
                    "id": collection.id,
                    "name": collection.name,
                    "description": collection.description,
                }
            )

        return jsonify({"success": True, "collections": entries})

375 

376 

@library_bp.route("/api/documents")
@login_required
def get_documents():
    """Get documents with filtering.

    Query parameters (all optional): ``research_id``, ``domain``,
    ``file_type``, ``favorites`` (only the literal ``"true"`` enables the
    filter), ``search``, ``limit`` (default 100) and ``offset`` (default 0).

    Returns:
        JSON object with a single ``documents`` key.
    """
    username = session.get("username")
    service = LibraryService(username)

    args = request.args

    # ``type=int`` makes the lookup fall back to the default when the value
    # is not a valid integer, instead of raising and producing a 500.
    limit = args.get("limit", 100, type=int)
    offset = args.get("offset", 0, type=int)

    documents = service.get_documents(
        research_id=args.get("research_id"),
        domain=args.get("domain"),
        file_type=args.get("file_type"),
        favorites_only=args.get("favorites") == "true",
        search_query=args.get("search"),
        limit=limit,
        offset=offset,
    )

    return jsonify({"documents": documents})

404 

405 

@library_bp.route(
    "/api/document/<string:document_id>/favorite", methods=["POST"]
)
@login_required
def toggle_favorite(document_id):
    """Flip a document's favorite flag and report the resulting state."""
    library_service = LibraryService(session.get("username"))
    new_state = library_service.toggle_favorite(document_id)
    response_body = {"favorite": new_state}
    return jsonify(response_body)

416 

417 

@library_bp.route("/api/document/<string:document_id>", methods=["DELETE"])
@login_required
def delete_document(document_id):
    """Remove a document from the library and report whether it succeeded."""
    library_service = LibraryService(session.get("username"))
    deleted = library_service.delete_document(document_id)
    response_body = {"success": deleted}
    return jsonify(response_body)

426 

427 

@library_bp.route("/api/document/<string:document_id>/pdf-url")
@login_required
def get_pdf_url(document_id):
    """Return the URL under which the document's PDF can be viewed.

    No lookup is performed: the URL is built from ``document_id`` and the
    title is a fixed placeholder.
    """
    viewer_link = {
        "url": f"/library/api/document/{document_id}/pdf",
        "title": "Document",  # Placeholder; the real title is not fetched
    }
    return jsonify(viewer_link)

439 

440 

@library_bp.route("/document/<string:document_id>/pdf")
@login_required
def view_pdf_page(document_id):
    """Serve a document's PDF inline, loading it through PDFStorageManager.

    Returns a plain-text 404 when the document does not exist or when the
    storage manager yields no PDF bytes for it.
    """
    username = session.get("username")

    with get_user_db_session(username) as db_session:
        document = db_session.query(Document).filter_by(id=document_id).first()

        if not document:
            logger.warning(
                f"Document ID {document_id} not found in database for user {username}"
            )
            return "Document not found", 404

        logger.info(
            f"Document {document_id}: title='{document.title}', "
            f"file_path={document.file_path}"
        )

        # Storage mode and library root configure the PDF storage manager
        settings_manager = get_settings_manager(db_session)
        mode = settings_manager.get_setting(
            "research_library.pdf_storage_mode", "none"
        )
        configured_root = settings_manager.get_setting(
            "research_library.storage_path",
            str(get_library_directory()),
        )
        root_dir = Path(configured_root).expanduser()

        storage = PDFStorageManager(root_dir, mode)
        content = storage.load_pdf(document, db_session)

        if not content:
            # The storage manager found nothing to serve
            logger.warning(f"No PDF available for document {document_id}")
            return "PDF not available", 404

        logger.info(
            f"Serving PDF for document {document_id} ({len(content)} bytes)"
        )
        return send_file(
            BytesIO(content),
            mimetype="application/pdf",
            as_attachment=False,
            download_name=document.filename or "document.pdf",
        )

492 

493 

@library_bp.route("/api/document/<string:document_id>/pdf")
@login_required
def serve_pdf_api(document_id):
    """Backward-compatible API alias that delegates to ``view_pdf_page``."""
    pdf_response = view_pdf_page(document_id)
    return pdf_response

499 

500 

@library_bp.route("/document/<string:document_id>/txt")
@login_required
def view_text_page(document_id):
    """Render an HTML page showing a document's extracted text.

    Returns a plain-text 404 when the document is missing or has no text.
    """
    with get_user_db_session(session.get("username")) as db_session:
        # The text is read from Document.text_content
        record = db_session.query(Document).filter_by(id=document_id).first()

        if not record:
            logger.warning(f"Document not found for document ID {document_id}")
            return "Document not found", 404

        body_text = record.text_content
        if not body_text:
            logger.warning(f"Document {document_id} has no text content")
            return "Text content not available", 404

        logger.info(
            f"Serving text content for document {document_id}: {len(body_text)} characters"
        )

        page_context = {
            "document_id": document_id,
            "title": record.title or "Document Text",
            "text_content": body_text,
            "extraction_method": record.extraction_method,
            "word_count": record.word_count,
        }
        return render_template_with_defaults(
            "pages/document_text.html", **page_context
        )

532 

533 

@library_bp.route("/api/document/<string:document_id>/text")
@login_required
def serve_text_api(document_id):
    """Return a document's extracted text as JSON (backward-compatible API).

    Responds with a JSON ``error`` and status 404 when the document is
    missing or has no text.
    """
    with get_user_db_session(session.get("username")) as db_session:
        # The text is read from Document.text_content
        record = db_session.query(Document).filter_by(id=document_id).first()

        if not record:
            logger.warning(f"Document not found for document ID {document_id}")
            return jsonify({"error": "Document not found"}), 404

        body_text = record.text_content
        if not body_text:
            logger.warning(f"Document {document_id} has no text content")
            return jsonify({"error": "Text content not available"}), 404

        logger.info(
            f"Serving text content for document {document_id}: {len(body_text)} characters"
        )

        response_body = {
            "text_content": body_text,
            "title": record.title or "Document",
            "extraction_method": record.extraction_method,
            "word_count": record.word_count,
        }
        return jsonify(response_body)

564 

565 

@library_bp.route("/api/open-folder", methods=["POST"])
@login_required
def open_folder():
    """Open folder containing a document.

    Expects a JSON body with a ``path`` key. The path is validated against
    the configured library root before it is handed to the file-location
    opener.

    Returns:
        JSON with ``success`` and, on failure, an ``error`` message.
    """
    # ``request.json`` is None for a JSON ``null`` body; fall back to an
    # empty mapping so the missing-path branch below handles it.
    data = request.json or {}
    path = data.get("path")

    if not path:
        return jsonify({"success": False, "error": "Path not provided"})

    try:
        # Library root comes from settings, defaulting to the central
        # library directory
        settings = get_settings_manager()
        library_root = (
            Path(
                settings.get_setting(
                    "research_library.storage_path",
                    str(get_library_directory()),
                )
            )
            .expanduser()
            .resolve()
        )

        # Validate the path is within library root
        validated_path = PathValidator.validate_safe_path(
            path, library_root, allow_absolute=False
        )

        if not validated_path or not validated_path.exists():
            return jsonify(
                {"success": False, "error": "Invalid or non-existent path"}
            )

        # Use centralized file location opener
        success = open_file_location(str(validated_path))
        return jsonify({"success": success})
    except ValueError as e:
        # ValueError is reported to the client as an invalid path
        logger.warning(f"Path validation failed: {e}")
        return jsonify({"success": False, "error": "Invalid path"})
    except Exception:
        # Details go to the log only; the client gets a generic message
        logger.exception("Failed to open folder")
        return jsonify(
            {"success": False, "error": "An internal error has occurred."}
        )

611 

612 

@library_bp.route("/api/download/<int:resource_id>", methods=["POST"])
@login_required
def download_single_resource(resource_id):
    """Download one resource.

    Returns ``{"success": True}`` on success, a generic error with status 500
    when the download fails, or the authentication error response when no
    user password is available.
    """
    username = session.get("username")
    password, auth_error = get_authenticated_user_password(username)
    if auth_error:
        return auth_error

    downloader = DownloadService(username, password)
    succeeded, failure_reason = downloader.download_resource(resource_id)

    if not succeeded:
        # The real reason is logged; the client only sees a generic message
        logger.warning(
            f"Download failed for resource {resource_id}: {failure_reason}"
        )
        failure_body = {
            "success": False,
            "error": "Download failed. Please try again or contact support.",
        }
        return jsonify(failure_body), 500

    return jsonify({"success": True})

634 

635 

@library_bp.route("/api/download-text/<int:resource_id>", methods=["POST"])
@login_required
def download_text_single(resource_id):
    """Download one resource as a text file.

    Any exception is routed through ``handle_api_error``. A failed download
    is reported with a generic message so internal details stay in the log.
    """
    try:
        username = session.get("username")
        password, auth_error = get_authenticated_user_password(username)
        if auth_error:
            return auth_error

        downloader = DownloadService(username, password)
        succeeded, failure_reason = downloader.download_as_text(resource_id)

        if succeeded:
            return jsonify({"success": True, "error": None})

        # Log the reason when there is one; never send it to the client
        if failure_reason:
            logger.warning(
                f"Download as text failed for resource {resource_id}: {failure_reason}"
            )
        return jsonify(
            {"success": False, "error": "Failed to download resource"}
        )
    except Exception as e:
        return handle_api_error(
            f"downloading resource {resource_id} as text", e
        )

666 

667 

@library_bp.route("/api/download-all-text", methods=["POST"])
@login_required
def download_all_text():
    """Download all undownloaded resources as text files.

    Streams progress as Server-Sent Events. One event is sent per processed
    resource with ``progress``, ``current``, ``total``, ``file``, ``url``,
    ``status`` and ``error``; a final event carries ``complete`` and
    ``total``. If no user password is available the stream ends without
    emitting any event.
    """
    username = session.get("username")
    # Read the session ID here and pass it in explicitly; the generator below
    # runs later, while the response is being streamed.
    flask_session_id = session.get("session_id")

    def generate():
        # Get user password for database operations
        user_password, _ = get_authenticated_user_password(
            username, flask_session_id
        )
        if user_password is None:
            logger.error(f"Could not get password for user {username}")
            return

        download_service = DownloadService(username, user_password)

        # Named ``db_session`` so it does not shadow Flask's ``session``
        with get_user_db_session(username) as db_session:
            resources = db_session.query(ResearchResource).all()

            # A resource needs processing unless a "*_<id>.txt" file already
            # exists in the txt directory. The directory check is loop
            # invariant, so it is done once.
            txt_path = Path(download_service.library_root) / "txt"
            txt_dir_exists = txt_path.exists()
            resources_to_process = [
                resource
                for resource in resources
                if not (
                    txt_dir_exists
                    and list(txt_path.glob(f"*_{resource.id}.txt"))
                )
            ]

            total = len(resources_to_process)
            logger.info(f"Found {total} resources needing text extraction")

            for current, resource in enumerate(resources_to_process, start=1):
                progress = int((current / total) * 100) if total > 0 else 100

                # A missing or empty title falls back to a generated name
                # instead of failing on the slice and aborting the stream.
                title = resource.title
                file_name = title[:50] if title else f"document_{current}.txt"

                try:
                    success, error = download_service.download_as_text(
                        resource.id
                    )
                    if success:
                        status = "success"
                        error_msg = None
                    else:
                        status = "failed"
                        error_msg = error or "Text extraction failed"
                except Exception as e:
                    # One bad resource must not stop the whole batch
                    logger.exception(
                        f"Error extracting text for resource {resource.id}"
                    )
                    status = "failed"
                    error_msg = f"Text extraction failed - {type(e).__name__}"

                # Send update
                update = {
                    "progress": progress,
                    "current": current,
                    "total": total,
                    "file": file_name,
                    "url": resource.url,  # Add the URL for UI display
                    "status": status,
                    "error": error_msg,
                }
                yield f"data: {json.dumps(update)}\n\n"

            # Send completion
            yield f"data: {json.dumps({'complete': True, 'total': total})}\n\n"

    return Response(
        stream_with_context(generate()),
        mimetype="text/event-stream",
        # Same anti-buffering headers as the bulk download stream
        headers={"Cache-Control": "no-cache", "X-Accel-Buffering": "no"},
    )

757 

758 

@library_bp.route("/api/download-research/<research_id>", methods=["POST"])
@login_required
def download_research_pdfs(research_id):
    """Queue every PDF of one research session for download.

    An optional ``collection_id`` in the JSON body is passed through to the
    download service. Responds with ``success`` and the service's ``queued``
    result.
    """
    username = session.get("username")
    password, auth_error = get_authenticated_user_password(username)
    if auth_error:
        return auth_error

    downloader = DownloadService(username, password)

    # The request body is optional; treat a missing one as empty
    body = request.json or {}
    queued_result = downloader.queue_research_downloads(
        research_id, body.get("collection_id")
    )

    # Start processing queue (in production, this would be a background task)
    # For now, we'll process synchronously
    # TODO: Integrate with existing queue processor

    return jsonify({"success": True, "queued": queued_result})

780 

781 

# Lower-case substrings that mark an exception as an access restriction
# (reported as "skipped") rather than a real failure.
_BULK_SKIP_PHRASES = (
    "paywall",
    "subscription",
    "not available",
    "not found",
    "no free",
    "embargoed",
    "forbidden",
    "not accessible",
)

# Lower-case substrings that mark an exception as a download failure.
_BULK_DOWNLOAD_FAILURE_PHRASES = (
    "failed to download",
    "could not",
    "invalid",
    "server",
)


def _classify_bulk_download_error(exc):
    """Map an exception raised while processing one item to (status, reason)."""
    error_type = type(exc).__name__
    message = str(exc).lower()
    if any(phrase in message for phrase in _BULK_SKIP_PHRASES):
        return (
            "skipped",
            f"Document not accessible (paywall or access restriction) - {error_type}",
        )
    if any(phrase in message for phrase in _BULK_DOWNLOAD_FAILURE_PHRASES):
        return "failed", f"Download failed - {error_type}"
    return "failed", f"Processing failed - {error_type}"


@library_bp.route("/api/download-bulk", methods=["POST"])
@login_required
def download_bulk():
    """Download PDFs or extract text from multiple research sessions.

    JSON body: ``research_ids`` (required, non-empty), ``mode`` (``"pdf"`` by
    default; any other value selects text extraction) and an optional
    ``collection_id`` used when downloads have to be queued.

    Returns:
        400 with a JSON error when no research IDs are given; otherwise a
        Server-Sent Events stream. It starts with a zero-progress event,
        sends one event per queue item (``progress``, ``current``, ``total``,
        ``file``, ``status`` and, when set, ``error``) and ends with a
        ``complete`` event. If no user password is available the stream ends
        without emitting any event.
    """
    username = session.get("username")
    # ``request.json`` is None for a JSON ``null`` body
    data = request.json or {}
    research_ids = data.get("research_ids", [])
    mode = data.get("mode", "pdf")  # pdf or text_only
    # Optional: target collection for downloads
    collection_id = data.get("collection_id")

    if not research_ids:
        return jsonify({"error": "No research IDs provided"}), 400

    # Read the session ID here and pass it in explicitly; the generator below
    # runs later, while the response is being streamed.
    flask_session_id = session.get("session_id")

    def generate():
        """Generate progress updates as Server-Sent Events."""
        user_password, _ = get_authenticated_user_password(
            username, flask_session_id
        )
        if user_password is None:
            return

        download_service = DownloadService(username, user_password)
        action = "Download" if mode == "pdf" else "Text extraction"

        # Count total pending queue items across all research IDs
        total = 0
        current = 0

        # Named ``db_session`` so it does not shadow Flask's ``session``
        with get_user_db_session(username) as db_session:
            for research_id in research_ids:
                count = (
                    db_session.query(LibraryDownloadQueue)
                    .filter_by(
                        research_id=research_id, status=DocumentStatus.PENDING
                    )
                    .count()
                )
                total += count
                logger.debug(
                    f"[PROGRESS_DEBUG] Research {research_id}: {count} pending items in queue"
                )

        logger.info(
            f"[PROGRESS_DEBUG] Total pending downloads across all research: {total}"
        )
        yield f"data: {json.dumps({'progress': 0, 'current': 0, 'total': total})}\n\n"

        # Process each research
        for research_id in research_ids:
            with get_user_db_session(username) as db_session:
                # Get pending queue items for this research
                queue_items = (
                    db_session.query(LibraryDownloadQueue)
                    .filter_by(
                        research_id=research_id, status=DocumentStatus.PENDING
                    )
                    .all()
                )

                # If no items queued yet, queue them now
                if not queue_items:
                    try:
                        download_service.queue_research_downloads(
                            research_id, collection_id
                        )
                        # Re-fetch with the same enum filter as above
                        queue_items = (
                            db_session.query(LibraryDownloadQueue)
                            .filter_by(
                                research_id=research_id,
                                status=DocumentStatus.PENDING,
                            )
                            .all()
                        )
                        # These items were not part of the initial count;
                        # add them so progress stays within 0-100%.
                        total += len(queue_items)
                    except Exception:
                        logger.exception(
                            f"Error queueing downloads for research {research_id}"
                        )
                        # Continue with empty queue_items
                        queue_items = []

                # Process each queued item
                for queue_item in queue_items:
                    current += 1
                    progress = (
                        min(100, int((current / total) * 100))
                        if total > 0
                        else 100
                    )
                    logger.debug(
                        f"[PROGRESS_DEBUG] Calculated progress: {progress}%"
                    )

                    # Get resource info; a missing resource or title falls
                    # back to a generated name
                    resource = db_session.query(ResearchResource).get(
                        queue_item.resource_id
                    )
                    title = resource.title if resource else None
                    file_name = (
                        title[:50] if title else f"document_{current}.pdf"
                    )

                    skip_reason = None
                    status = "skipped"  # Default to skipped

                    try:
                        logger.debug(
                            f"Attempting {'PDF download' if mode == 'pdf' else 'text extraction'} for resource {queue_item.resource_id}"
                        )

                        # Call appropriate service method based on mode
                        if mode == "pdf":
                            result = download_service.download_resource(
                                queue_item.resource_id
                            )
                        else:  # text_only
                            result = download_service.download_as_text(
                                queue_item.resource_id
                            )

                        # The service may return (success, reason) or a
                        # bare success flag
                        if isinstance(result, tuple):
                            success, skip_reason = result
                        else:
                            success = result
                            skip_reason = None

                        status = "success" if success else "skipped"
                        if skip_reason and not success:
                            logger.info(
                                f"{action} skipped for resource {queue_item.resource_id}: {skip_reason}"
                            )

                        logger.debug(
                            f"{action} result: success={success}, status={status}, skip_reason={skip_reason}"
                        )
                    except Exception as e:
                        # Log error but continue processing. Only the
                        # categorized reason is sent to the UI.
                        logger.info(
                            f"CAUGHT Download exception for resource {queue_item.resource_id}: {type(e).__name__}: {e}"
                        )
                        status, skip_reason = _classify_bulk_download_error(e)

                    # Send progress update
                    update_data = {
                        "progress": progress,
                        "current": current,
                        "total": total,
                        "file": file_name,
                        "status": status,
                    }
                    # Add skip reason if available
                    if skip_reason:
                        update_data["error"] = skip_reason
                        logger.info(
                            f"Sending skip reason to UI: {skip_reason}"
                        )

                    logger.info(f"Update data being sent: {update_data}")
                    yield f"data: {json.dumps(update_data)}\n\n"

        yield f"data: {json.dumps({'progress': 100, 'current': total, 'total': total, 'complete': True})}\n\n"

    return Response(
        stream_with_context(generate()),
        mimetype="text/event-stream",
        headers={"Cache-Control": "no-cache", "X-Accel-Buffering": "no"},
    )

1010 

1011 

@library_bp.route("/api/research-list")
@login_required
def get_research_list():
    """Return the research sessions of the current user with download stats.

    Returns:
        A JSON response of the form ``{"research": [...]}`` where the list is
        the value produced by ``LibraryService.get_research_list_with_stats``.
    """
    # The service is scoped to the username stored in the Flask session.
    library = LibraryService(session.get("username"))
    return jsonify({"research": library.get_research_list_with_stats()})

1020 

1021 

@library_bp.route("/api/sync-library", methods=["POST"])
@login_required
def sync_library():
    """Reconcile the library database with the files on disk.

    Returns:
        A JSON response containing whatever statistics
        ``LibraryService.sync_library_with_filesystem`` reports.
    """
    # The service is scoped to the username stored in the Flask session.
    library = LibraryService(session.get("username"))
    return jsonify(library.sync_library_with_filesystem())

1030 

1031 

@library_bp.route("/api/mark-redownload", methods=["POST"])
@login_required
def mark_for_redownload():
    """Mark documents for re-download.

    Expects a JSON object body with a ``document_ids`` list.

    Returns:
        ``{"success": True, "marked": <count>}`` on success, or a 400 response
        with ``{"error": "No document IDs provided"}`` when the body is
        missing, is not a JSON object, or contains no document IDs.
    """
    username = session.get("username")
    service = LibraryService(username)

    # A missing, malformed or non-object body (e.g. JSON ``null`` or a list)
    # must produce the 400 below rather than an AttributeError on ``.get``.
    payload = request.get_json(silent=True)
    data = payload if isinstance(payload, dict) else {}
    document_ids = data.get("document_ids", [])

    if not document_ids:
        return jsonify({"error": "No document IDs provided"}), 400

    count = service.mark_for_redownload(document_ids)
    return jsonify({"success": True, "marked": count})

1047 

1048 

@library_bp.route("/api/queue-all-undownloaded", methods=["POST"])
@login_required
def queue_all_undownloaded():
    """Queue all articles that haven't been downloaded yet.

    Selects every ``ResearchResource`` without a completed ``Document``, runs
    the candidates through ``ResourceFilter``, and then creates (or resets to
    pending) a download-queue entry for each resource that passes the filter
    and whose URL is accepted by ``is_downloadable_domain``. Nothing is
    downloaded here; this endpoint only fills the queue.

    Returns:
        The error response from ``get_authenticated_user_password`` if it
        produced one; otherwise a JSON object with the keys ``success``,
        ``queued``, ``research_ids``, ``total_undownloaded``, ``skipped``,
        ``filter_summary`` and ``skipped_details``.
    """
    username = session.get("username")

    logger.info(f"queue_all_undownloaded called for user {username}")

    with get_user_db_session(username) as db_session:
        # Find all resources that don't have a completed download
        undownloaded = (
            db_session.query(ResearchResource)
            .outerjoin(
                Document,
                (ResearchResource.id == Document.resource_id)
                & (Document.status == "completed"),
            )
            .filter(Document.id.is_(None))
            .all()
        )

        logger.info(f"Found {len(undownloaded)} total undownloaded resources")

        # Get user password for encrypted database access
        user_password, error_response = get_authenticated_user_password(
            username
        )
        if error_response:
            return error_response

        resource_filter = ResourceFilter(username, user_password)
        filter_results = resource_filter.filter_downloadable_resources(
            undownloaded
        )

        # Get detailed filtering summary
        filter_summary = resource_filter.get_filter_summary(undownloaded)
        skipped_info = resource_filter.get_skipped_resources_info(undownloaded)

        logger.info(f"Filter results: {filter_summary.to_dict()}")

        # Index the filter results once so the loop below does a constant-time
        # lookup per resource instead of rescanning the whole result list.
        # ``setdefault`` keeps the first result for an id, matching the
        # first-match behaviour of a linear search.
        results_by_resource = {}
        for result in filter_results:
            results_by_resource.setdefault(result.resource_id, result)

        queued_count = 0
        research_ids = set()
        skipped_count = 0

        for resource in undownloaded:
            # Check if resource passed the smart filter
            filter_result = results_by_resource.get(resource.id)

            if not filter_result or not filter_result.can_retry:
                skipped_count += 1
                if filter_result:
                    logger.debug(
                        f"Skipping resource {resource.id} due to retry policy: {filter_result.reason}"
                    )
                else:
                    logger.debug(
                        f"Skipping resource {resource.id} - no filter result available"
                    )
                continue

            # Resources without a URL can never be downloaded
            if not resource.url:
                skipped_count += 1
                continue

            # Check if it's downloadable using proper URL parsing
            is_downloadable = is_downloadable_domain(resource.url)

            # Log what we're checking
            if "pubmed" in resource.url.lower():
                logger.info(f"Found PubMed URL: {resource.url[:100]}")

            if not is_downloadable:
                skipped_count += 1
                logger.debug(
                    f"Skipping non-downloadable URL: {resource.url[:100]}"
                )
                continue

            # Check if already in queue (any status)
            existing_queue = (
                db_session.query(LibraryDownloadQueue)
                .filter_by(resource_id=resource.id)
                .first()
            )

            if existing_queue:
                if existing_queue.status != DocumentStatus.PENDING:
                    # It exists but isn't pending: reset it to pending
                    existing_queue.status = DocumentStatus.PENDING
                    existing_queue.completed_at = None
                    logger.debug(
                        f"Reset queue entry for resource {resource.id} to pending"
                    )
                else:
                    logger.debug(
                        f"Resource {resource.id} already pending in queue"
                    )
            else:
                # Add new entry to queue
                queue_entry = LibraryDownloadQueue(
                    resource_id=resource.id,
                    research_id=resource.research_id,
                    priority=0,
                    status=DocumentStatus.PENDING,
                )
                db_session.add(queue_entry)
                logger.debug(
                    f"Added new queue entry for resource {resource.id}"
                )

            # New, reset and already-pending entries all count as queued.
            queued_count += 1
            research_ids.add(resource.research_id)

        db_session.commit()

        logger.info(
            f"Queued {queued_count} articles for download, skipped {skipped_count} resources (including {filter_summary.permanently_failed_count} permanently failed and {filter_summary.temporarily_failed_count} temporarily failed)"
        )

        # Note: Removed synchronous download processing here to avoid blocking the HTTP request
        # Downloads will be processed via the SSE streaming endpoint or background tasks

        return jsonify(
            {
                "success": True,
                "queued": queued_count,
                "research_ids": list(research_ids),
                "total_undownloaded": len(undownloaded),
                "skipped": skipped_count,
                "filter_summary": filter_summary.to_dict(),
                "skipped_details": skipped_info,
            }
        )

1190 

1191 

@library_bp.route("/api/get-research-sources/<research_id>", methods=["GET"])
@login_required
def get_research_sources(research_id):
    """Get all sources for a research with snippets.

    Args:
        research_id: Identifier of the research, taken from the URL path.

    Returns:
        A JSON object ``{"success": True, "sources": [...], "total": n}``.
        Each source carries its 1-based ``number`` (resources are ordered by
        ``created_at``), ``resource_id``, ``url``, ``title``, ``snippet``,
        ``domain``, ``relevance_score`` and download information. The
        ``downloaded``/``document_id``/``file_type``/``download_date`` fields
        are filled in only when a document with status ``"completed"`` is
        found for the resource.
    """
    username = session.get("username")

    sources = []
    with get_user_db_session(username) as db_session:
        # Get all resources for this research
        resources = (
            db_session.query(ResearchResource)
            .filter_by(research_id=research_id)
            .order_by(ResearchResource.created_at)
            .all()
        )

        for idx, resource in enumerate(resources, 1):
            # Check if document exists
            document = (
                db_session.query(Document)
                .filter_by(resource_id=resource.id)
                .first()
            )

            # Get domain from URL; this is best-effort, so a malformed URL
            # simply leaves the domain empty.
            domain = ""
            if resource.url:
                try:
                    domain = urlparse(resource.url).hostname or ""
                except ValueError:
                    # urlparse raises ValueError for malformed URLs. Catching
                    # only that keeps KeyboardInterrupt/SystemExit and genuine
                    # bugs from being silently swallowed.
                    logger.debug(
                        f"Could not parse URL for resource {resource.id}"
                    )

            source_data = {
                "number": idx,
                "resource_id": resource.id,
                "url": resource.url,
                "title": resource.title or f"Source {idx}",
                "snippet": resource.content_preview or "",
                "domain": domain,
                "relevance_score": getattr(resource, "relevance_score", None),
                "downloaded": False,
                "document_id": None,
                "file_type": None,
            }

            if document and document.status == "completed":
                source_data.update(
                    {
                        "downloaded": True,
                        "document_id": document.id,
                        "file_type": document.file_type,
                        "download_date": document.created_at.isoformat()
                        if document.created_at
                        else None,
                    }
                )

            sources.append(source_data)

    return jsonify({"success": True, "sources": sources, "total": len(sources)})

1254 

1255 

@library_bp.route("/api/check-downloads", methods=["POST"])
@login_required
def check_downloads():
    """Check download status for a list of URLs.

    Expects a JSON object body with ``research_id`` and a ``urls`` list.

    Returns:
        ``{"download_status": {url: {...}}}`` with one entry per matching
        resource of that research. An entry has ``downloaded: True`` plus
        document details when a document with status ``"completed"`` is found
        for the resource, otherwise ``downloaded: False`` and the
        ``resource_id``. URLs that match no resource are absent from the
        mapping. Responds with 400 when the body is missing, is not a JSON
        object, or lacks ``research_id``/``urls``.
    """
    username = session.get("username")

    # A missing, malformed or non-object body (e.g. JSON ``null`` or a list)
    # must produce the 400 below rather than an AttributeError on ``.get``.
    payload = request.get_json(silent=True)
    data = payload if isinstance(payload, dict) else {}
    research_id = data.get("research_id")
    urls = data.get("urls", [])

    if not research_id or not urls:
        return jsonify({"error": "Missing research_id or urls"}), 400

    download_status = {}

    with get_user_db_session(username) as db_session:
        # Get the resources of this research whose URL was asked about
        resources = (
            db_session.query(ResearchResource)
            .filter_by(research_id=research_id)
            .filter(ResearchResource.url.in_(urls))
            .all()
        )

        for resource in resources:
            # Check if document exists
            document = (
                db_session.query(Document)
                .filter_by(resource_id=resource.id)
                .first()
            )

            if document and document.status == "completed":
                download_status[resource.url] = {
                    "downloaded": True,
                    "document_id": document.id,
                    "file_path": document.file_path,
                    "file_type": document.file_type,
                    "title": document.title or resource.title,
                }
            else:
                download_status[resource.url] = {
                    "downloaded": False,
                    "resource_id": resource.id,
                }

    return jsonify({"download_status": download_status})

1302 

1303 

@library_bp.route("/api/download-source", methods=["POST"])
@login_required
def download_source():
    """Download a single source from a research.

    Expects a JSON object body with ``research_id`` and ``url``. The matching
    resource is put into the download queue with priority 1 (or its existing
    queue entry is reset to pending with that priority) and then downloaded
    synchronously through ``DownloadService``.

    Returns:
        - the error response from ``get_authenticated_user_password`` if it
          produced one;
        - 400 when ``research_id``/``url`` are missing or the URL is rejected
          by ``is_downloadable_domain``;
        - 404 when no resource matches the research and URL;
        - ``{"success": True, ...}`` when the document was already downloaded
          or the download completed;
        - ``{"success": False, "message": "Download failed"}`` otherwise.
    """
    username = session.get("username")
    user_password, error_response = get_authenticated_user_password(username)
    if error_response:
        return error_response

    # A missing, malformed or non-object body (e.g. JSON ``null`` or a list)
    # must produce the 400 below rather than an AttributeError on ``.get``.
    payload = request.get_json(silent=True)
    data = payload if isinstance(payload, dict) else {}
    research_id = data.get("research_id")
    url = data.get("url")

    if not research_id or not url:
        return jsonify({"error": "Missing research_id or url"}), 400

    # Check if URL is downloadable
    if not is_downloadable_domain(url):
        return jsonify({"error": "URL is not from a downloadable domain"}), 400

    with get_user_db_session(username) as db_session:
        # Find the resource
        resource = (
            db_session.query(ResearchResource)
            .filter_by(research_id=research_id, url=url)
            .first()
        )

        if not resource:
            return jsonify({"error": "Resource not found"}), 404

        # Check if already downloaded
        existing = (
            db_session.query(Document)
            .filter_by(resource_id=resource.id)
            .first()
        )

        # The other routes in this module read the completion state from
        # ``Document.status``; use the same attribute here.
        if existing and existing.status == "completed":
            return jsonify(
                {
                    "success": True,
                    "message": "Already downloaded",
                    "document_id": existing.id,
                }
            )

        # Add to download queue
        queue_entry = (
            db_session.query(LibraryDownloadQueue)
            .filter_by(resource_id=resource.id)
            .first()
        )

        if not queue_entry:
            queue_entry = LibraryDownloadQueue(
                resource_id=resource.id,
                research_id=resource.research_id,
                priority=1,  # Higher priority for manual downloads
                status=DocumentStatus.PENDING,
            )
            db_session.add(queue_entry)
        else:
            queue_entry.status = DocumentStatus.PENDING
            queue_entry.priority = 1

        db_session.commit()

        # Start download immediately
        service = DownloadService(username, user_password)
        success, message = service.download_resource(resource.id)

        if success:
            return jsonify({"success": True, "message": "Download completed"})

        # Log internal message, but show only generic message to user
        logger.warning(
            f"Download failed for resource {resource.id}: {message}"
        )
        return jsonify({"success": False, "message": "Download failed"})