Coverage for src/local_deep_research/research_library/routes/library_routes.py: 94%

550 statements  

« prev     ^ index     » next       coverage.py v7.14.1, created at 2026-06-03 23:15 +0000

1""" 

2Routes for Research Library and Download Manager 

3 

4Provides web endpoints for: 

5- Library browsing and management 

6- Download manager interface 

7- API endpoints for downloads and queries 

8""" 

9 

10import json 

11import math 

12from io import BytesIO 

13from pathlib import Path 

14from flask import ( 

15 Blueprint, 

16 g, 

17 jsonify, 

18 request, 

19 session, 

20 Response, 

21 send_file, 

22 stream_with_context, 

23) 

24from loguru import logger 

25 

26from ...security.decorators import require_json_body 

27from ...web.auth.decorators import login_required 

28from ...web.utils.templates import render_template_with_defaults 

29from ...database.session_context import get_user_db_session, safe_rollback 

30from ...database.models.research import ResearchResource 

31from ...database.models.library import ( 

32 Document as Document, 

33 DocumentStatus, 

34 DownloadQueue as LibraryDownloadQueue, 

35 Collection, 

36) 

37from ...library.download_management import ResourceFilter 

38from ..services.download_service import DownloadService 

39from ..services.library_service import LibraryService 

40from ..services.pdf_storage_manager import PDFStorageManager 

41from ..utils import ( 

42 get_document_for_resource, 

43 handle_api_error, 

44 is_downloadable_domain, 

45 is_downloadable_url, 

46) 

47from ...utilities.db_utils import get_settings_manager 

48from ...config.paths import get_library_directory 

49from ...web.exceptions import AuthenticationRequiredError 

50 

51# Create Blueprint 

52library_bp = Blueprint("library", __name__, url_prefix="/library") 

53 

54# NOTE: Routes use session["username"] (not .get()) intentionally. 

55# @login_required guarantees the key exists; direct access fails fast 

56# if the decorator is ever removed. 

57 

58 

59# Error handler for authentication errors 

from ...web.exceptions import WebAPIException


# Registered for WebAPIException only. A blueprint handler registered for the
# bare ``Exception`` class is also selected for werkzeug HTTPExceptions raised
# inside the blueprint's views; re-raising those from the handler makes Flask
# report them as 500 errors instead of their real status code.
@library_bp.errorhandler(WebAPIException)
def handle_web_api_exception(error):
    """Convert a WebAPIException (or subclass) into a JSON error response.

    Args:
        error: The exception routed to this handler by Flask.

    Returns:
        A ``(response, status_code)`` tuple built from ``error.to_dict()``
        and ``error.status_code``.

    Raises:
        Exception: ``error`` itself, when it is not a WebAPIException. This
            only happens if the function is called directly with another
            exception type.
    """
    if isinstance(error, WebAPIException):
        return jsonify(error.to_dict()), error.status_code
    # Kept for direct callers: anything else is not ours to translate.
    raise error

69 

70 

def get_authenticated_user_password(
    username: str, flask_session_id: str | None = None
) -> str:
    """
    Look up the password of an authenticated user.

    The session password store is consulted first; if it yields nothing
    (or raises), ``g.user_password`` is used as a fallback.

    Args:
        username: The username to get the password for.
        flask_session_id: Optional Flask session ID. When falsy,
            ``session.get("session_id")`` is used instead.

    Returns:
        str: The user's password.

    Raises:
        AuthenticationRequiredError: If neither source provides a password.
    """
    from ...database.session_passwords import session_password_store

    lookup_session_id = flask_session_id or session.get("session_id")

    # Primary source: the session password store. Any failure here is
    # logged and treated the same as "no password found".
    try:
        stored_password = session_password_store.get_session_password(
            username, lookup_session_id
        )
        if stored_password:
            logger.debug(
                f"Retrieved user password from session store for user {username}"
            )
            return stored_password
    except Exception:
        logger.exception("Failed to get user password from session store")

    # Secondary source: g.user_password (set by middleware if temp_auth was used).
    fallback_password = getattr(g, "user_password", None)
    if not fallback_password:
        logger.error(f"No user password available for user {username}")
        raise AuthenticationRequiredError(
            message="Authentication required: Please refresh the page and log in again to access encrypted database features.",
        )

    logger.debug(
        f"Retrieved user password from g.user_password fallback for user {username}"
    )
    return fallback_password

117 

118 

119# ============= Page Routes ============= 

120 

121 

@library_bp.route("/")
@login_required
def library_page():
    """Render the main library page listing downloaded documents.

    Reads the optional ``domain``, ``research``, ``collection``, ``date``
    and ``page`` query parameters, clamps the page number to the valid
    range, and passes the filtered documents plus dropdown data to the
    ``pages/library.html`` template.
    """
    username = session["username"]
    service = LibraryService(username)

    # Library settings that drive the PDF-storage UI.
    from ...utilities.db_utils import get_settings_manager

    settings = get_settings_manager()
    pdf_storage_mode = settings.get_setting(
        "research_library.pdf_storage_mode", "database"
    )
    shared_library = settings.get_setting(
        "research_library.shared_library", False
    )

    stats = service.get_library_stats()

    # Optional filters from the query string.
    query_args = request.args
    domain_filter = query_args.get("domain")
    research_filter = query_args.get("research")
    collection_filter = query_args.get("collection")
    date_filter = query_args.get("date")

    # Resolve the collection once; the default library is only looked up
    # when no collection was requested.
    from ...database.library_init import get_default_library_id

    resolved_collection = collection_filter or get_default_library_id(username)

    # The same filter set is used for counting and for fetching.
    filter_kwargs = {
        "research_id": research_filter,
        "domain": domain_filter,
        "collection_id": resolved_collection,
        "date_filter": date_filter,
    }

    # Pagination, clamped to [1, total_pages].
    per_page = 100
    total_docs = service.count_documents(**filter_kwargs)
    total_pages = max(1, math.ceil(total_docs / per_page))
    requested_page = request.args.get("page", 1, type=int)
    page = max(1, min(requested_page, total_pages))

    documents = service.get_documents(
        **filter_kwargs,
        limit=per_page,
        offset=(page - 1) * per_page,
    )

    # Data for the filter dropdowns.
    unique_domains = service.get_unique_domains()
    research_list = service.get_research_list_for_dropdown()
    collections = service.get_all_collections()

    # First collection flagged as default (used for semantic search), if any.
    default_collection_id = None
    for collection in collections:
        if collection.get("is_default"):
            default_collection_id = collection["id"]
            break

    return render_template_with_defaults(
        "pages/library.html",
        stats=stats,
        documents=documents,
        unique_domains=unique_domains,
        research_list=research_list,
        collections=collections,
        selected_collection=collection_filter,
        default_collection_id=default_collection_id,
        storage_path=stats.get("storage_path", ""),
        # The PDF storage button is enabled for every mode except "none".
        enable_pdf_storage=pdf_storage_mode != "none",
        pdf_storage_mode=pdf_storage_mode,
        shared_library=shared_library,
        page=page,
        total_pages=total_pages,
        selected_date=date_filter,
        selected_research=research_filter,
        selected_domain=domain_filter,
    )

211 

212 

@library_bp.route("/document/<string:document_id>")
@login_required
def document_details_page(document_id):
    """Render the details page for one document, or 404 if it is unknown."""
    document = LibraryService(session["username"]).get_document_by_id(
        document_id
    )

    if document:
        return render_template_with_defaults(
            "pages/document_details.html", document=document
        )

    return "Document not found", 404

229 

230 

@library_bp.route("/download-manager")
@login_required
def download_manager_page():
    """Render the download manager page for research PDFs.

    Shows one page of research sessions (50 per page), each enriched with
    its PDF source previews and domain breakdown, together with summary
    statistics computed over all sessions.
    """
    username = session["username"]
    service = LibraryService(username)

    # Library settings that drive the PDF-storage UI.
    from ...utilities.db_utils import get_settings_manager

    settings = get_settings_manager()
    pdf_storage_mode = settings.get_setting(
        "research_library.pdf_storage_mode", "database"
    )
    shared_library = settings.get_setting(
        "research_library.shared_library", False
    )

    # Summary over ALL sessions; its total also determines the page count.
    per_page = 50
    summary = service.get_download_manager_summary_stats()
    total_pages = max(1, math.ceil(summary["total_researches"] / per_page))

    # Requested page, clamped to [1, total_pages].
    requested_page = request.args.get("page", 1, type=int)
    page = max(1, min(requested_page, total_pages))

    research_list = service.get_research_list_with_stats(
        limit=per_page, offset=(page - 1) * per_page
    )

    # One batch call for the previews of every session on this page.
    previews = service.get_pdf_previews_batch(
        [entry["id"] for entry in research_list]
    )
    for entry in research_list:
        preview = previews.get(
            entry["id"], {"pdf_sources": [], "domains": {}}
        )
        entry["pdf_sources"] = preview["pdf_sources"]
        entry["domains"] = preview["domains"]

    return render_template_with_defaults(
        "pages/download_manager.html",
        research_list=research_list,
        total_researches=summary["total_researches"],
        total_resources=summary["total_resources"],
        already_downloaded=summary["already_downloaded"],
        available_to_download=summary["available_to_download"],
        # The PDF storage button is enabled for every mode except "none".
        enable_pdf_storage=pdf_storage_mode != "none",
        pdf_storage_mode=pdf_storage_mode,
        shared_library=shared_library,
        page=page,
        total_pages=total_pages,
    )

288 

289 

290# ============= API Routes ============= 

291 

292 

@library_bp.route("/api/stats")
@login_required
def get_library_stats():
    """Return the current user's library statistics as JSON."""
    library_service = LibraryService(session["username"])
    return jsonify(library_service.get_library_stats())

301 

302 

@library_bp.route("/api/collections/list")
@login_required
def get_collections_list():
    """Return all collections, ordered by name, for dropdown selection.

    The JSON payload has a ``success`` flag and a ``collections`` list of
    ``{"id", "name", "description"}`` entries.
    """
    username = session["username"]

    with get_user_db_session(username) as db_session:
        ordered_collections = (
            db_session.query(Collection).order_by(Collection.name).all()
        )

        # Serialize while the session is still open.
        entries = []
        for collection in ordered_collections:
            entries.append(
                {
                    "id": collection.id,
                    "name": collection.name,
                    "description": collection.description,
                }
            )

        return jsonify({"success": True, "collections": entries})

327 

328 

@library_bp.route("/api/documents")
@login_required
def get_documents():
    """Return documents matching the query-string filters as JSON.

    Supported query parameters: ``research_id``, ``domain``, ``file_type``,
    ``favorites`` (only the literal ``"true"`` enables the filter),
    ``search``, ``limit`` (default 100) and ``offset`` (default 0).

    ``limit`` and ``offset`` are parsed with ``type=int`` so that a
    non-numeric value falls back to the default instead of raising
    ``ValueError`` and producing a 500 response.
    """
    username = session["username"]
    service = LibraryService(username)

    # Get filter parameters
    research_id = request.args.get("research_id")
    domain = request.args.get("domain")
    file_type = request.args.get("file_type")
    favorites_only = request.args.get("favorites") == "true"
    search_query = request.args.get("search")
    # Same parsing style as the ``page`` parameter of the page routes.
    limit = request.args.get("limit", 100, type=int)
    offset = request.args.get("offset", 0, type=int)

    documents = service.get_documents(
        research_id=research_id,
        domain=domain,
        file_type=file_type,
        favorites_only=favorites_only,
        search_query=search_query,
        limit=limit,
        offset=offset,
    )

    return jsonify({"documents": documents})

356 

357 

@library_bp.route(
    "/api/document/<string:document_id>/favorite", methods=["POST"]
)
@login_required
def toggle_favorite(document_id):
    """Flip a document's favorite flag and return the resulting state."""
    library_service = LibraryService(session["username"])
    new_state = library_service.toggle_favorite(document_id)
    return jsonify({"favorite": new_state})

368 

369 

@library_bp.route("/api/document/<string:document_id>", methods=["DELETE"])
@login_required
def delete_document(document_id):
    """Delete a document from the library and report the service's result."""
    library_service = LibraryService(session["username"])
    deleted = library_service.delete_document(document_id)
    return jsonify({"success": deleted})

378 

379 

@library_bp.route("/api/document/<string:document_id>/pdf-url")
@login_required
def get_pdf_url(document_id):
    """Return the URL under which the document's PDF is served.

    No lookup is performed: the title is a fixed placeholder and the URL is
    built purely from ``document_id``.
    """
    payload = {
        "url": f"/library/api/document/{document_id}/pdf",
        "title": "Document",  # Placeholder; the real title is not fetched
    }
    return jsonify(payload)

391 

392 

@library_bp.route("/document/<string:document_id>/pdf")
@login_required
def view_pdf_page(document_id):
    """Serve a document's PDF inline, loading it through PDFStorageManager.

    Returns a plain-text 404 when the document does not exist or when no
    PDF bytes could be loaded for it.
    """
    username = session["username"]

    with get_user_db_session(username) as db_session:
        document = db_session.query(Document).filter_by(id=document_id).first()

        if not document:
            logger.warning(
                f"Document ID {document_id} not found in database for user {username}"
            )
            return "Document not found", 404

        logger.info(
            f"Document {document_id}: title='{document.title}', "
            f"file_path={document.file_path}"
        )

        # Settings that configure the storage manager.
        settings = get_settings_manager(db_session)
        storage_mode = settings.get_setting(
            "research_library.pdf_storage_mode", "none"
        )
        configured_root = settings.get_setting(
            "research_library.storage_path",
            str(get_library_directory()),
        )
        library_root = Path(configured_root).expanduser().resolve()

        # The storage manager decides where the bytes come from.
        pdf_bytes = PDFStorageManager(library_root, storage_mode).load_pdf(
            document, db_session
        )

        if not pdf_bytes:
            logger.warning(f"No PDF available for document {document_id}")
            return "PDF not available", 404

        logger.info(
            f"Serving PDF for document {document_id} ({len(pdf_bytes)} bytes)"
        )
        return send_file(
            BytesIO(pdf_bytes),
            mimetype="application/pdf",
            as_attachment=False,
            download_name=document.filename or "document.pdf",
        )

448 

449 

@library_bp.route("/api/document/<string:document_id>/pdf")
@login_required
def serve_pdf_api(document_id):
    """Backward-compatible API alias that delegates to ``view_pdf_page``."""
    response = view_pdf_page(document_id)
    return response

455 

456 

@library_bp.route("/document/<string:document_id>/txt")
@login_required
def view_text_page(document_id):
    """Render the extracted text of a document as an HTML page.

    Returns a plain-text 404 when the document is unknown or has no text
    content.
    """
    username = session["username"]

    with get_user_db_session(username) as db_session:
        # The text lives on the Document row itself (Document.text_content).
        document = db_session.query(Document).filter_by(id=document_id).first()

        if not document:
            logger.warning(f"Document not found for document ID {document_id}")
            return "Document not found", 404

        text_content = document.text_content
        if not text_content:
            logger.warning(f"Document {document_id} has no text content")
            return "Text content not available", 404

        logger.info(
            f"Serving text content for document {document_id}: {len(document.text_content)} characters"
        )

        template_context = {
            "document_id": document_id,
            "title": document.title or "Document Text",
            "text_content": document.text_content,
            "extraction_method": document.extraction_method,
            "word_count": document.word_count,
        }
        return render_template_with_defaults(
            "pages/document_text.html", **template_context
        )

488 

489 

@library_bp.route("/api/document/<string:document_id>/text")
@login_required
def serve_text_api(document_id):
    """Return a document's extracted text and metadata as JSON.

    Kept for backward compatibility. Responds with a JSON error and status
    404 when the document is unknown or has no text content.
    """
    username = session["username"]

    with get_user_db_session(username) as db_session:
        # The text lives on the Document row itself (Document.text_content).
        document = db_session.query(Document).filter_by(id=document_id).first()

        if not document:
            logger.warning(f"Document not found for document ID {document_id}")
            return jsonify({"error": "Document not found"}), 404

        text_content = document.text_content
        if not text_content:
            logger.warning(f"Document {document_id} has no text content")
            return jsonify({"error": "Text content not available"}), 404

        logger.info(
            f"Serving text content for document {document_id}: {len(document.text_content)} characters"
        )

        payload = {
            "text_content": document.text_content,
            "title": document.title or "Document",
            "extraction_method": document.extraction_method,
            "word_count": document.word_count,
        }
        return jsonify(payload)

520 

521 

@library_bp.route("/api/open-folder", methods=["POST"])
@login_required
def open_folder():
    """Reject requests to open a document's folder.

    Security: the feature is disabled for server deployments. It only makes
    sense for desktop usage, where server and client share one machine, so
    this endpoint always answers with a 403 error payload.
    """
    disabled_response = {
        "status": "error",
        "message": "This feature is disabled. It is only available in desktop mode.",
    }
    return jsonify(disabled_response), 403

536 

537 

@library_bp.route("/api/download/<int:resource_id>", methods=["POST"])
@login_required
def download_single_resource(resource_id):
    """Download one resource and report success or a generic failure.

    The service's error detail is only logged; the client receives a fixed
    message with status 500 when the download fails.
    """
    username = session["username"]
    user_password = get_authenticated_user_password(username)

    with DownloadService(username, user_password) as service:
        succeeded, failure_detail = service.download_resource(resource_id)

        if not succeeded:
            logger.warning(
                f"Download failed for resource {resource_id}: {failure_detail}"
            )
            failure_payload = {
                "success": False,
                "error": "Download failed. Please try again or contact support.",
            }
            return jsonify(failure_payload), 500

        return jsonify({"success": True})

556 

557 

@library_bp.route("/api/download-text/<int:resource_id>", methods=["POST"])
@login_required
def download_text_single(resource_id):
    """Download one resource as a text file.

    Failures reported by the service are logged and answered with a
    sanitized message. AuthenticationRequiredError is re-raised for the
    blueprint error handler; any other exception goes to
    ``handle_api_error``.
    """
    try:
        username = session["username"]
        user_password = get_authenticated_user_password(username)

        with DownloadService(username, user_password) as service:
            succeeded, failure_detail = service.download_as_text(resource_id)

            if succeeded:
                return jsonify({"success": True, "error": None})

            # Internal details stay in the log, not in the response.
            if failure_detail:
                logger.warning(
                    f"Download as text failed for resource {resource_id}: {failure_detail}"
                )
            return jsonify(
                {"success": False, "error": "Failed to download resource"}
            )
    except AuthenticationRequiredError:
        raise  # Let blueprint error handler return 401
    except Exception as e:
        return handle_api_error(
            f"downloading resource {resource_id} as text", e
        )

586 

587 

@library_bp.route("/api/download-all-text", methods=["POST"])
@login_required
def download_all_text():
    """Extract text for every downloadable resource that has no text file yet.

    Progress is streamed as Server-Sent Events: one ``data:`` event per
    processed resource, followed by a final event with ``complete: True``.
    If the user's password cannot be obtained, a single error event is sent
    and the stream ends.

    Returns:
        A streaming ``text/event-stream`` response.
    """
    username = session["username"]
    # Read from the Flask session here and handed to the generator explicitly.
    flask_session_id = session.get("session_id")

    def generate():
        # The password is required for the encrypted database operations.
        try:
            user_password = get_authenticated_user_password(
                username, flask_session_id
            )
        except AuthenticationRequiredError:
            logger.warning(
                f"Authentication unavailable for user {username} - password not in session store"
            )
            yield f"data: {json.dumps({'progress': 0, 'current': 0, 'total': 0, 'error': 'Authentication required', 'complete': True})}\n\n"
            return

        download_service = DownloadService(username, user_password)
        try:
            # Named db_session so it does not shadow Flask's ``session``.
            with get_user_db_session(username) as db_session:
                all_resources = db_session.query(ResearchResource).all()
                # Snapshot plain values for downloadable (academic/PDF)
                # resources so the processing loop below does not depend on
                # ORM instances staying attached to this session.
                candidates = [
                    (r.id, r.url, r.title)
                    for r in all_resources
                    if is_downloadable_url(r.url)
                ]

            txt_path = Path(download_service.library_root) / "txt"

            # Scan the directory once; file names follow *_{id}.txt.
            existing_resource_ids = set()
            if txt_path.exists():
                for txt_file in txt_path.glob("*.txt"):
                    parts = txt_file.stem.rsplit("_", 1)
                    if len(parts) == 2:
                        try:
                            existing_resource_ids.add(int(parts[1]))
                        except ValueError:
                            # Suffix is not a resource ID; ignore this file.
                            pass

            to_process = [
                candidate
                for candidate in candidates
                if candidate[0] not in existing_resource_ids
            ]
            total = len(to_process)

            logger.info(f"Found {total} resources needing text extraction")

            for current, (resource_id, url, title) in enumerate(
                to_process, start=1
            ):
                progress = int((current / total) * 100) if total > 0 else 100

                # Guard on the title itself: a resource without a title
                # would otherwise raise TypeError and abort the stream.
                file_name = title[:50] if title else f"document_{current}.txt"

                try:
                    success, error = download_service.download_as_text(
                        resource_id
                    )
                    if success:
                        status = "success"
                        error_msg = None
                    else:
                        status = "failed"
                        error_msg = error or "Text extraction failed"
                except Exception as e:
                    # One failing resource must not end the whole stream.
                    logger.exception(
                        f"Error extracting text for resource {resource_id}"
                    )
                    status = "failed"
                    error_msg = f"Text extraction failed - {type(e).__name__}"

                update = {
                    "progress": progress,
                    "current": current,
                    "total": total,
                    "file": file_name,
                    "url": url,  # Shown in the UI
                    "status": status,
                    "error": error_msg,
                }
                yield f"data: {json.dumps(update)}\n\n"

            # Send completion
            yield f"data: {json.dumps({'complete': True, 'total': total})}\n\n"
        finally:
            from ...utilities.resource_utils import safe_close

            safe_close(download_service, "download service")

    return Response(
        stream_with_context(generate()),
        mimetype="text/event-stream",
        # Same anti-caching / anti-buffering headers as the bulk download stream.
        headers={"Cache-Control": "no-cache", "X-Accel-Buffering": "no"},
    )

701 

702 

@library_bp.route("/api/download-research/<research_id>", methods=["POST"])
@login_required
def download_research_pdfs(research_id):
    """Queue all PDFs from a research session for download.

    The request body is optional. When it is a JSON object, its
    ``collection_id`` value is passed on as the target collection.

    Args:
        research_id: ID of the research session taken from the URL.

    Returns:
        JSON with ``success`` and ``queued`` (the value returned by
        ``queue_research_downloads``).
    """
    username = session["username"]
    user_password = get_authenticated_user_password(username)

    with DownloadService(username, user_password) as service:
        # ``request.json`` rejects requests that carry no JSON body or
        # content type. The body is optional here, so parse leniently and
        # treat anything that is not a JSON object as "no options given".
        data = request.get_json(silent=True)
        if not isinstance(data, dict):
            data = {}
        collection_id = data.get("collection_id")

        queued = service.queue_research_downloads(research_id, collection_id)

        # Only queues the downloads; processing is not started here.
        # TODO: Integrate with existing queue processor

        return jsonify({"success": True, "queued": queued})

722 

723 

# Substrings (matched case-insensitively against the exception text) that mark
# an exception as an access restriction rather than a genuine failure.
_BULK_SKIP_PHRASES = (
    "paywall",
    "subscription",
    "not available",
    "not found",
    "no free",
    "embargoed",
    "forbidden",
    "not accessible",
)

# Substrings that mark an exception as a download failure.
_BULK_DOWNLOAD_FAILURE_PHRASES = (
    "failed to download",
    "could not",
    "invalid",
    "server",
)


def _classify_bulk_download_error(error):
    """Map an exception from a bulk download to ``(status, reason)`` for the UI.

    The reason contains only a category and the exception type name, not the
    raw exception text.
    """
    error_type = type(error).__name__
    message = str(error).lower()
    if any(phrase in message for phrase in _BULK_SKIP_PHRASES):
        return (
            "skipped",
            f"Document not accessible (paywall or access restriction) - {error_type}",
        )
    if any(phrase in message for phrase in _BULK_DOWNLOAD_FAILURE_PHRASES):
        return "failed", f"Download failed - {error_type}"
    return "failed", f"Processing failed - {error_type}"


@library_bp.route("/api/download-bulk", methods=["POST"])
@login_required
@require_json_body()
def download_bulk():
    """Download PDFs or extract text for several research sessions.

    Expects a JSON body with ``research_ids`` (required, non-empty),
    ``mode`` (``"pdf"`` by default; any other value selects text
    extraction) and an optional ``collection_id``.

    Returns:
        A 400 JSON error when no research IDs are given; otherwise a
        ``text/event-stream`` response with an initial event carrying the
        pending total, one event per processed queue item and a final event
        with ``complete: True``.
    """
    username = session["username"]
    data = request.json
    research_ids = data.get("research_ids", [])
    mode = data.get("mode", "pdf")  # pdf or text_only
    collection_id = data.get("collection_id")  # Optional target collection

    if not research_ids:
        return jsonify({"error": "No research IDs provided"}), 400

    # Read from the Flask session here and handed to the generator explicitly.
    flask_session_id = session.get("session_id")

    def pending_items(db_session, research_id):
        """Build the query for a research's pending queue items."""
        return db_session.query(LibraryDownloadQueue).filter_by(
            research_id=research_id,
            status=DocumentStatus.PENDING,
        )

    def generate():
        """Generate progress updates as Server-Sent Events."""
        # The password is required for the encrypted database operations.
        try:
            user_password = get_authenticated_user_password(
                username, flask_session_id
            )
        except AuthenticationRequiredError:
            logger.warning(
                f"Authentication unavailable for user {username} - password not in session store"
            )
            yield f"data: {json.dumps({'progress': 0, 'current': 0, 'total': 0, 'error': 'Authentication required', 'complete': True})}\n\n"
            return

        is_pdf_mode = mode == "pdf"
        attempt_label = "PDF download" if is_pdf_mode else "text extraction"
        result_label = "Download" if is_pdf_mode else "Text extraction"

        download_service = DownloadService(username, user_password)
        try:
            # Count pending queue items across all research IDs up front.
            total = 0
            current = 0

            # Named db_session so it does not shadow Flask's ``session``.
            with get_user_db_session(username) as db_session:
                for research_id in research_ids:
                    count = pending_items(db_session, research_id).count()
                    total += count
                    logger.debug(
                        f"[PROGRESS_DEBUG] Research {research_id}: {count} pending items in queue"
                    )

            logger.info(
                f"[PROGRESS_DEBUG] Total pending downloads across all research: {total}"
            )
            yield f"data: {json.dumps({'progress': 0, 'current': 0, 'total': total})}\n\n"

            for research_id in research_ids:
                with get_user_db_session(username) as db_session:
                    queue_items = pending_items(db_session, research_id).all()

                    # Nothing queued yet for this research: queue it now.
                    if not queue_items:
                        try:
                            download_service.queue_research_downloads(
                                research_id, collection_id
                            )
                            # Re-fetch with the same enum-based filter as the
                            # other queue queries.
                            queue_items = pending_items(
                                db_session, research_id
                            ).all()
                        except Exception:
                            logger.exception(
                                f"Error queueing downloads for research {research_id}"
                            )
                            queue_items = []

                    for queue_item in queue_items:
                        current += 1
                        # Items queued lazily above are not part of ``total``,
                        # so cap the percentage at 100.
                        progress = (
                            min(100, int((current / total) * 100))
                            if total > 0
                            else 100
                        )
                        logger.debug(
                            f"[PROGRESS_DEBUG] Calculated progress: {progress}%"
                        )

                        resource_id = queue_item.resource_id
                        resource = db_session.query(ResearchResource).get(
                            resource_id
                        )
                        # Guard on the title as well: a missing title would
                        # otherwise raise TypeError and abort the stream.
                        title = resource.title if resource else None
                        file_name = (
                            title[:50] if title else f"document_{current}.pdf"
                        )

                        skip_reason = None
                        status = "skipped"  # Default to skipped

                        try:
                            logger.debug(
                                f"Attempting {attempt_label} for resource {resource_id}"
                            )

                            if is_pdf_mode:
                                result = download_service.download_resource(
                                    resource_id
                                )
                            else:  # text_only
                                result = download_service.download_as_text(
                                    resource_id
                                )

                            # Accept both the (success, reason) tuple and a
                            # bare success value.
                            if isinstance(result, tuple):
                                success, skip_reason = result
                            else:
                                success, skip_reason = result, None

                            status = "success" if success else "skipped"
                            if skip_reason and not success:
                                logger.info(
                                    f"{result_label} skipped for resource {resource_id}: {skip_reason}"
                                )

                            logger.debug(
                                f"{result_label} result: success={success}, status={status}, skip_reason={skip_reason}"
                            )
                        except Exception as e:
                            # Roll back FIRST: the next iteration queries the
                            # same session before its own try/except, so a
                            # poisoned session would cascade into
                            # PendingRollbackError (issue #3827).
                            safe_rollback(db_session, "SSE download")
                            # Log the error but keep processing the queue.
                            logger.info(
                                f"CAUGHT Download exception for resource {resource_id}: {type(e).__name__}: {e}"
                            )
                            status, skip_reason = (
                                _classify_bulk_download_error(e)
                            )

                        update_data = {
                            "progress": progress,
                            "current": current,
                            "total": total,
                            "file": file_name,
                            "status": status,
                        }
                        if skip_reason:
                            update_data["error"] = skip_reason
                            logger.info(
                                f"Sending skip reason to UI: {skip_reason}"
                            )

                        logger.info(f"Update data being sent: {update_data}")
                        yield f"data: {json.dumps(update_data)}\n\n"

            yield f"data: {json.dumps({'progress': 100, 'current': total, 'total': total, 'complete': True})}\n\n"
        finally:
            from ...utilities.resource_utils import safe_close

            safe_close(download_service, "download service")

    return Response(
        stream_with_context(generate()),
        mimetype="text/event-stream",
        headers={"Cache-Control": "no-cache", "X-Accel-Buffering": "no"},
    )

976 

977 

@library_bp.route("/api/research-list")
@login_required
def get_research_list():
    """Return the research sessions used to populate dropdowns.

    Responds with a JSON object whose ``research`` key holds whatever
    ``LibraryService.get_research_list_for_dropdown`` returns for the
    logged-in user.
    """
    # Direct key access is intentional: @login_required guarantees the key.
    library_service = LibraryService(session["username"])
    return jsonify(
        {"research": library_service.get_research_list_for_dropdown()}
    )

986 

987 

@library_bp.route("/api/sync-library", methods=["POST"])
@login_required
def sync_library():
    """Reconcile the library database with the filesystem.

    Delegates to ``LibraryService.sync_library_with_filesystem`` for the
    logged-in user and returns its result as the JSON response body.
    """
    library_service = LibraryService(session["username"])
    return jsonify(library_service.sync_library_with_filesystem())

996 

997 

@library_bp.route("/api/mark-redownload", methods=["POST"])
@login_required
@require_json_body()
def mark_for_redownload():
    """Flag the documents named in the JSON body for re-download.

    Reads ``document_ids`` from the request body. Responds with HTTP 400
    when that list is missing or empty; otherwise returns
    ``{"success": True, "marked": <value>}`` where the value is what
    ``LibraryService.mark_for_redownload`` returned.
    """
    # The service is created before validation, as in the original flow.
    library_service = LibraryService(session["username"])

    requested_ids = request.json.get("document_ids", [])
    if not requested_ids:
        return jsonify({"error": "No document IDs provided"}), 400

    marked = library_service.mark_for_redownload(requested_ids)
    return jsonify({"success": True, "marked": marked})

1014 

1015 

@library_bp.route("/api/queue-all-undownloaded", methods=["POST"])
@login_required
def queue_all_undownloaded():
    """Put every not-yet-downloaded resource into the download queue.

    Steps performed inside a single user database session:

    1. Select resources that have no document with status "completed".
    2. Run them through ``ResourceFilter`` and skip those without a filter
       result or whose result has ``can_retry`` false.
    3. Skip resources with no URL or with a URL rejected by
       ``is_downloadable_domain``.
    4. Create a pending queue entry, or reset an existing non-pending one.
       Entries that are already pending are counted as queued too.

    Returns a JSON object with the queued and skipped counts, the affected
    research ids, the filter summary and the skipped-resource details.
    Nothing is downloaded here; this route only fills the queue.
    """
    username = session["username"]
    logger.info(f"queue_all_undownloaded called for user {username}")

    with get_user_db_session(username) as db_session:
        # Outer join against completed documents only; rows where the join
        # found nothing are the resources still lacking a completed download.
        completed_document_join = (
            (ResearchResource.id == Document.resource_id)
            | (ResearchResource.document_id == Document.id)
        ) & (Document.status == "completed")
        undownloaded = (
            db_session.query(ResearchResource)
            .outerjoin(Document, completed_document_join)
            .filter(Document.id.is_(None))
            .all()
        )
        logger.info(f"Found {len(undownloaded)} total undownloaded resources")

        # The user password is needed for encrypted database access.
        resource_filter = ResourceFilter(
            username, get_authenticated_user_password(username)
        )
        filter_results = resource_filter.filter_downloadable_resources(
            undownloaded
        )
        filter_summary = resource_filter.get_filter_summary(undownloaded)
        skipped_info = resource_filter.get_skipped_resources_info(undownloaded)
        logger.info(f"Filter results: {filter_summary.to_dict()}")

        # Index filter results by resource id for constant-time lookup.
        verdict_by_resource = {
            result.resource_id: result for result in filter_results
        }

        queued_count = 0
        skipped_count = 0
        research_ids = set()

        for resource in undownloaded:
            verdict = verdict_by_resource.get(resource.id)

            if not verdict or not verdict.can_retry:
                skipped_count += 1
                if verdict:
                    logger.debug(
                        f"Skipping resource {resource.id} due to retry policy: {verdict.reason}"
                    )
                else:
                    logger.debug(
                        f"Skipping resource {resource.id} - no filter result available"
                    )
                continue

            if not resource.url:
                skipped_count += 1
                continue

            # The URL is known to be non-empty from here on.
            is_downloadable = is_downloadable_domain(resource.url)

            if "pubmed" in resource.url.lower():
                logger.info(f"Found PubMed URL: {resource.url[:100]}")

            if not is_downloadable:
                skipped_count += 1
                logger.debug(
                    f"Skipping non-downloadable URL: {resource.url[:100]}"
                )
                continue

            # Look for a queue entry in any status.
            existing_queue = (
                db_session.query(LibraryDownloadQueue)
                .filter_by(resource_id=resource.id)
                .first()
            )

            if not existing_queue:
                db_session.add(
                    LibraryDownloadQueue(
                        resource_id=resource.id,
                        research_id=resource.research_id,
                        priority=0,
                        status=DocumentStatus.PENDING,
                    )
                )
                logger.debug(
                    f"Added new queue entry for resource {resource.id}"
                )
            elif existing_queue.status != DocumentStatus.PENDING:
                existing_queue.status = DocumentStatus.PENDING
                existing_queue.completed_at = None
                logger.debug(
                    f"Reset queue entry for resource {resource.id} to pending"
                )
            else:
                logger.debug(
                    f"Resource {resource.id} already pending in queue"
                )

            # New, reset and already-pending entries all count as queued.
            queued_count += 1
            research_ids.add(resource.research_id)

        db_session.commit()

        logger.info(
            f"Queued {queued_count} articles for download, skipped {skipped_count} resources (including {filter_summary.permanently_failed_count} permanently failed and {filter_summary.temporarily_failed_count} temporarily failed)"
        )

        # Downloads are deliberately not processed here so the HTTP request
        # does not block; the SSE streaming endpoint or background tasks
        # handle them.
        return jsonify(
            {
                "success": True,
                "queued": queued_count,
                "research_ids": list(research_ids),
                "total_undownloaded": len(undownloaded),
                "skipped": skipped_count,
                "filter_summary": filter_summary.to_dict(),
                "skipped_details": skipped_info,
            }
        )

1156 

1157 

@library_bp.route("/api/get-research-sources/<research_id>", methods=["GET"])
@login_required
def get_research_sources(research_id):
    """List every source of one research, with snippet and download state.

    Args:
        research_id: Identifier taken from the URL path; used to select the
            matching ``ResearchResource`` rows.

    Returns:
        JSON object with ``success``, the ``sources`` list (ordered by
        resource creation time and numbered from 1) and ``total``.
    """
    from urllib.parse import urlparse

    username = session["username"]
    sources = []

    with get_user_db_session(username) as db_session:
        resources = (
            db_session.query(ResearchResource)
            .filter_by(research_id=research_id)
            .order_by(ResearchResource.created_at)
            .all()
        )

        for number, resource in enumerate(resources, start=1):
            document = get_document_for_resource(db_session, resource)

            # Best-effort host extraction; a malformed URL leaves it empty.
            domain = ""
            if resource.url:
                try:
                    domain = urlparse(resource.url).hostname or ""
                except (ValueError, AttributeError):
                    domain = ""

            entry = {
                "number": number,
                "resource_id": resource.id,
                "url": resource.url,
                "title": resource.title or f"Source {number}",
                "snippet": resource.content_preview or "",
                "domain": domain,
                "relevance_score": getattr(resource, "relevance_score", None),
                "downloaded": False,
                "document_id": None,
                "file_type": None,
            }

            # Only a completed document counts as downloaded.
            if document and document.status == "completed":
                entry["downloaded"] = True
                entry["document_id"] = document.id
                entry["file_type"] = document.file_type
                entry["download_date"] = (
                    document.created_at.isoformat()
                    if document.created_at
                    else None
                )

            sources.append(entry)

    return jsonify({"success": True, "sources": sources, "total": len(sources)})

1217 

1218 

@library_bp.route("/api/check-downloads", methods=["POST"])
@login_required
@require_json_body()
def check_downloads():
    """Report the download state of a set of URLs within one research.

    Expects ``research_id`` and a non-empty ``urls`` list in the JSON body
    and responds with HTTP 400 when either is missing. The response maps
    each URL that matched a stored resource to its download details; URLs
    without a matching resource do not appear in the mapping.
    """
    username = session["username"]
    payload = request.json
    research_id = payload.get("research_id")
    urls = payload.get("urls", [])

    if not (research_id and urls):
        return jsonify({"error": "Missing research_id or urls"}), 400

    download_status = {}

    with get_user_db_session(username) as db_session:
        # Restrict to this research's resources whose URL was asked about.
        matching_resources = (
            db_session.query(ResearchResource)
            .filter(
                ResearchResource.research_id == research_id,
                ResearchResource.url.in_(urls),
            )
            .all()
        )

        for resource in matching_resources:
            document = get_document_for_resource(db_session, resource)

            if document and document.status == "completed":
                entry = {
                    "downloaded": True,
                    "document_id": document.id,
                    "file_path": document.file_path,
                    "file_type": document.file_type,
                    "title": document.title or resource.title,
                }
            else:
                entry = {
                    "downloaded": False,
                    "resource_id": resource.id,
                }
            download_status[resource.url] = entry

    return jsonify({"download_status": download_status})

1262 

1263 

@library_bp.route("/api/download-source", methods=["POST"])
@login_required
@require_json_body()
def download_source():
    """Download a single source from a research.

    Expects ``research_id`` and ``url`` in the JSON body. The resource is
    put into the download queue as pending with priority 1, then downloaded
    synchronously through ``DownloadService``.

    Returns:
        - HTTP 400 when a field is missing or the URL is rejected by
          ``is_downloadable_domain``.
        - HTTP 404 when no resource matches ``research_id`` and ``url``.
        - ``{"success": True, ...}`` when a completed document already
          exists or the download succeeds.
        - ``{"success": False, "message": "Download failed"}`` otherwise.
          The detailed reason is written to the log and never sent to the
          client.
    """
    username = session["username"]
    user_password = get_authenticated_user_password(username)
    data = request.json
    research_id = data.get("research_id")
    url = data.get("url")

    if not research_id or not url:
        return jsonify({"error": "Missing research_id or url"}), 400

    # Reject URLs outside the downloadable domains before touching the DB.
    if not is_downloadable_domain(url):
        return jsonify({"error": "URL is not from a downloadable domain"}), 400

    with get_user_db_session(username) as db_session:
        resource = (
            db_session.query(ResearchResource)
            .filter_by(research_id=research_id, url=url)
            .first()
        )

        if not resource:
            return jsonify({"error": "Resource not found"}), 404

        # Short-circuit when a completed document already exists.
        existing = get_document_for_resource(db_session, resource)

        if existing and existing.status == "completed":
            return jsonify(
                {
                    "success": True,
                    "message": "Already downloaded",
                    "document_id": existing.id,
                }
            )

        # Create or re-arm the queue entry for this resource.
        queue_entry = (
            db_session.query(LibraryDownloadQueue)
            .filter_by(resource_id=resource.id)
            .first()
        )

        if not queue_entry:
            queue_entry = LibraryDownloadQueue(
                resource_id=resource.id,
                research_id=resource.research_id,
                priority=1,  # Higher priority for manual downloads
                status=DocumentStatus.PENDING,
            )
            db_session.add(queue_entry)
        else:
            queue_entry.status = DocumentStatus.PENDING
            queue_entry.priority = 1
            # Clear the stale completion timestamp, matching the reset done
            # by the queue-all-undownloaded route.
            queue_entry.completed_at = None

        db_session.commit()

        # Start download immediately
        resource_id = resource.id
        with DownloadService(username, user_password) as service:
            success, message = service.download_resource(resource_id)

            if success:
                return jsonify(
                    {"success": True, "message": "Download completed"}
                )

            # Keep the internal reason in the log; the client only gets a
            # generic message.
            logger.warning(
                f"Download failed for resource {resource_id}: {message}"
            )
            return jsonify({"success": False, "message": "Download failed"})

1332 {"success": True, "message": "Download completed"} 

1333 ) 

1334 # Log internal message, but show only generic message to user 

1335 return jsonify({"success": False, "message": "Download failed"})