Coverage for src / local_deep_research / research_library / routes / library_routes.py: 94%

549 statements  

« prev     ^ index     » next       coverage.py v7.13.5, created at 2026-04-14 23:55 +0000

1""" 

2Routes for Research Library and Download Manager 

3 

4Provides web endpoints for: 

5- Library browsing and management 

6- Download manager interface 

7- API endpoints for downloads and queries 

8""" 

9 

10import json 

11import math 

12from io import BytesIO 

13from pathlib import Path 

14from flask import ( 

15 Blueprint, 

16 g, 

17 jsonify, 

18 request, 

19 session, 

20 Response, 

21 send_file, 

22 stream_with_context, 

23) 

24from loguru import logger 

25 

26from ...security.decorators import require_json_body 

27from ...web.auth.decorators import login_required 

28from ...web.utils.templates import render_template_with_defaults 

29from ...database.session_context import get_user_db_session 

30from ...database.models.research import ResearchResource 

31from ...database.models.library import ( 

32 Document as Document, 

33 DocumentStatus, 

34 DownloadQueue as LibraryDownloadQueue, 

35 Collection, 

36) 

37from ...library.download_management import ResourceFilter 

38from ..services.download_service import DownloadService 

39from ..services.library_service import LibraryService 

40from ..services.pdf_storage_manager import PDFStorageManager 

41from ..utils import ( 

42 get_document_for_resource, 

43 handle_api_error, 

44 is_downloadable_domain, 

45 is_downloadable_url, 

46) 

47from ...utilities.db_utils import get_settings_manager 

48from ...config.paths import get_library_directory 

49from ...web.exceptions import AuthenticationRequiredError 

50 

51# Create Blueprint 

52library_bp = Blueprint("library", __name__, url_prefix="/library") 

53 

54# NOTE: Routes use session["username"] (not .get()) intentionally. 

55# @login_required guarantees the key exists; direct access fails fast 

56# if the decorator is ever removed. 

57 

58 

59# Error handler for authentication errors 

@library_bp.errorhandler(Exception)
def handle_web_api_exception(error):
    """Convert WebAPIException (and subclasses) into JSON responses.

    Any other exception type is re-raised unchanged so Flask's default
    error handling applies.
    """
    from ...web.exceptions import WebAPIException

    if not isinstance(error, WebAPIException):
        # Not one of ours - propagate to Flask.
        raise error
    return jsonify(error.to_dict()), error.status_code

69 

70 

def get_authenticated_user_password(
    username: str, flask_session_id: str | None = None
) -> str:
    """
    Get authenticated user password from session store with fallback to g.user_password.

    Args:
        username: The username to get password for
        flask_session_id: Optional Flask session ID. If not provided, uses session.get("session_id")

    Returns:
        str: The user's password

    Raises:
        AuthenticationRequiredError: If no password is available for the user
    """
    from ...database.session_passwords import session_password_store

    sid = flask_session_id or session.get("session_id")

    # Primary source: the per-session password store.
    stored = None
    try:
        stored = session_password_store.get_session_password(username, sid)
    except Exception:
        logger.exception("Failed to get user password from session store")

    if stored:
        logger.debug(
            f"Retrieved user password from session store for user {username}"
        )
        return stored

    # Secondary source: g.user_password (set by middleware if temp_auth was used).
    fallback = getattr(g, "user_password", None)
    if fallback:
        logger.debug(
            f"Retrieved user password from g.user_password fallback for user {username}"
        )
        return fallback

    # Neither source has a password - caller must re-authenticate.
    logger.error(f"No user password available for user {username}")
    raise AuthenticationRequiredError(
        message="Authentication required: Please refresh the page and log in again to access encrypted database features.",
    )

117 

118 

119# ============= Page Routes ============= 

120 

121 

@library_bp.route("/")
@login_required
def library_page():
    """Main library page showing downloaded documents.

    Reads optional domain/research/collection/date filters and a page
    number from the query string, then renders a paginated document list
    together with dropdown data and library statistics.
    """
    username = session["username"]
    service = LibraryService(username)

    # Library settings. get_settings_manager is already imported at module
    # level; the redundant function-local import was removed.
    settings = get_settings_manager()
    pdf_storage_mode = settings.get_setting(
        "research_library.pdf_storage_mode", "database"
    )
    # Enable PDF storage button if mode is not "none"
    enable_pdf_storage = pdf_storage_mode != "none"
    shared_library = settings.get_setting(
        "research_library.shared_library", False
    )

    # Get statistics
    stats = service.get_library_stats()

    # Get documents with optional filters
    domain_filter = request.args.get("domain")
    research_filter = request.args.get("research")
    collection_filter = request.args.get("collection")  # New collection filter
    date_filter = request.args.get("date")

    # Resolve collection_id once to avoid redundant DB lookups
    from ...database.library_init import get_default_library_id

    resolved_collection = collection_filter or get_default_library_id(username)

    # Pagination: clamp the requested page into [1, total_pages].
    per_page = 100
    total_docs = service.count_documents(
        research_id=research_filter,
        domain=domain_filter,
        collection_id=resolved_collection,
        date_filter=date_filter,
    )
    total_pages = max(1, math.ceil(total_docs / per_page))
    page = request.args.get("page", 1, type=int)
    page = max(1, min(page, total_pages))
    offset = (page - 1) * per_page

    documents = service.get_documents(
        research_id=research_filter,
        domain=domain_filter,
        collection_id=resolved_collection,
        date_filter=date_filter,
        limit=per_page,
        offset=offset,
    )

    # Dropdown data for the filter UI.
    unique_domains = service.get_unique_domains()
    research_list = service.get_research_list_for_dropdown()
    collections = service.get_all_collections()

    # Find default library collection ID for semantic search
    default_collection_id = next(
        (c["id"] for c in collections if c.get("is_default")), None
    )

    return render_template_with_defaults(
        "pages/library.html",
        stats=stats,
        documents=documents,
        unique_domains=unique_domains,
        research_list=research_list,
        collections=collections,
        selected_collection=collection_filter,
        default_collection_id=default_collection_id,
        storage_path=stats.get("storage_path", ""),
        enable_pdf_storage=enable_pdf_storage,
        pdf_storage_mode=pdf_storage_mode,
        shared_library=shared_library,
        page=page,
        total_pages=total_pages,
        selected_date=date_filter,
        selected_research=research_filter,
        selected_domain=domain_filter,
    )

211 

212 

@library_bp.route("/document/<string:document_id>")
@login_required
def document_details_page(document_id):
    """Document details page showing all metadata and links."""
    service = LibraryService(session["username"])

    # Look the document up; unknown IDs get a plain-text 404.
    document = service.get_document_by_id(document_id)
    if document is None:
        return "Document not found", 404

    return render_template_with_defaults(
        "pages/document_details.html", document=document
    )

229 

230 

@library_bp.route("/download-manager")
@login_required
def download_manager_page():
    """Download manager page for selecting and downloading research PDFs.

    Shows a paginated list of research sessions, each enriched with PDF
    source previews and a per-domain breakdown, plus summary statistics
    computed over ALL sessions.
    """
    username = session["username"]
    service = LibraryService(username)

    # Library settings. get_settings_manager is already imported at module
    # level; the redundant function-local import was removed.
    settings = get_settings_manager()
    pdf_storage_mode = settings.get_setting(
        "research_library.pdf_storage_mode", "database"
    )
    # Enable PDF storage button if mode is not "none"
    enable_pdf_storage = pdf_storage_mode != "none"
    shared_library = settings.get_setting(
        "research_library.shared_library", False
    )

    # Summary stats over ALL sessions (also used for page count)
    per_page = 50
    summary = service.get_download_manager_summary_stats()
    total_pages = max(1, math.ceil(summary["total_researches"] / per_page))

    # Pagination with upper-bound clamp
    page = request.args.get("page", 1, type=int)
    page = max(1, min(page, total_pages))
    offset = (page - 1) * per_page

    # Get paginated research sessions
    research_list = service.get_research_list_with_stats(
        limit=per_page, offset=offset
    )

    # Batch-fetch PDF previews and domain breakdowns (single query)
    research_ids = [r["id"] for r in research_list]
    previews = service.get_pdf_previews_batch(research_ids)
    for research in research_list:
        rid = research["id"]
        data = previews.get(rid, {"pdf_sources": [], "domains": {}})
        research["pdf_sources"] = data["pdf_sources"]
        research["domains"] = data["domains"]

    return render_template_with_defaults(
        "pages/download_manager.html",
        research_list=research_list,
        total_researches=summary["total_researches"],
        total_resources=summary["total_resources"],
        already_downloaded=summary["already_downloaded"],
        available_to_download=summary["available_to_download"],
        enable_pdf_storage=enable_pdf_storage,
        pdf_storage_mode=pdf_storage_mode,
        shared_library=shared_library,
        page=page,
        total_pages=total_pages,
    )

288 

289 

290# ============= API Routes ============= 

291 

292 

@library_bp.route("/api/stats")
@login_required
def get_library_stats():
    """Return library statistics for the current user as JSON."""
    service = LibraryService(session["username"])
    return jsonify(service.get_library_stats())

301 

302 

@library_bp.route("/api/collections/list")
@login_required
def get_collections_list():
    """Get list of all collections for dropdown selection."""
    username = session["username"]

    with get_user_db_session(username) as db_session:
        # Collections sorted by name for stable dropdown ordering.
        rows = db_session.query(Collection).order_by(Collection.name).all()

        payload = [
            {
                "id": col.id,
                "name": col.name,
                "description": col.description,
            }
            for col in rows
        ]
        return jsonify({"success": True, "collections": payload})

327 

328 

@library_bp.route("/api/documents")
@login_required
def get_documents():
    """Get documents with filtering.

    Query parameters: research_id, domain, file_type, favorites ("true" to
    restrict to favorites), search, limit (default 100), offset (default 0).
    Returns JSON: {"documents": [...]}.
    """
    username = session["username"]
    service = LibraryService(username)

    # Get filter parameters
    research_id = request.args.get("research_id")
    domain = request.args.get("domain")
    file_type = request.args.get("file_type")
    favorites_only = request.args.get("favorites") == "true"
    search_query = request.args.get("search")
    # type=int falls back to the default on non-numeric input; the previous
    # bare int(...) cast raised ValueError and produced an HTTP 500.
    limit = request.args.get("limit", 100, type=int)
    offset = request.args.get("offset", 0, type=int)

    documents = service.get_documents(
        research_id=research_id,
        domain=domain,
        file_type=file_type,
        favorites_only=favorites_only,
        search_query=search_query,
        limit=limit,
        offset=offset,
    )

    return jsonify({"documents": documents})

356 

357 

@library_bp.route(
    "/api/document/<string:document_id>/favorite", methods=["POST"]
)
@login_required
def toggle_favorite(document_id):
    """Toggle favorite status of a document; returns the new state."""
    service = LibraryService(session["username"])
    return jsonify({"favorite": service.toggle_favorite(document_id)})

368 

369 

@library_bp.route("/api/document/<string:document_id>", methods=["DELETE"])
@login_required
def delete_document(document_id):
    """Delete a document from library."""
    service = LibraryService(session["username"])
    deleted = service.delete_document(document_id)
    return jsonify({"success": deleted})

378 

379 

@library_bp.route("/api/document/<string:document_id>/pdf-url")
@login_required
def get_pdf_url(document_id):
    """Get URL for viewing PDF."""
    # Return URL that will serve the PDF
    payload = {
        "url": f"/library/api/document/{document_id}/pdf",
        "title": "Document",  # Could fetch actual title
    }
    return jsonify(payload)

391 

392 

@library_bp.route("/document/<string:document_id>/pdf")
@login_required
def view_pdf_page(document_id):
    """Page for viewing PDF file - uses PDFStorageManager for retrieval."""
    username = session["username"]

    with get_user_db_session(username) as db_session:
        # Look up the document record first.
        document = db_session.query(Document).filter_by(id=document_id).first()
        if document is None:
            logger.warning(
                f"Document ID {document_id} not found in database for user {username}"
            )
            return "Document not found", 404

        logger.info(
            f"Document {document_id}: title='{document.title}', "
            f"file_path={document.file_path}"
        )

        # Resolve storage configuration for the PDF manager.
        settings = get_settings_manager(db_session)
        storage_mode = settings.get_setting(
            "research_library.pdf_storage_mode", "none"
        )
        configured_path = settings.get_setting(
            "research_library.storage_path",
            str(get_library_directory()),
        )
        library_root = Path(configured_path).expanduser().resolve()

        # PDFStorageManager handles both database- and filesystem-backed PDFs.
        pdf_manager = PDFStorageManager(library_root, storage_mode)
        pdf_bytes = pdf_manager.load_pdf(document, db_session)

        if not pdf_bytes:
            # No PDF found anywhere
            logger.warning(f"No PDF available for document {document_id}")
            return "PDF not available", 404

        logger.info(
            f"Serving PDF for document {document_id} ({len(pdf_bytes)} bytes)"
        )
        return send_file(
            BytesIO(pdf_bytes),
            mimetype="application/pdf",
            as_attachment=False,
            download_name=document.filename or "document.pdf",
        )

448 

449 

@library_bp.route("/api/document/<string:document_id>/pdf")
@login_required
def serve_pdf_api(document_id):
    """API endpoint for serving PDF file (kept for backward compatibility).

    Delegates directly to view_pdf_page, so the response (raw PDF bytes or
    a plain-text 404) is identical to the page route.
    """
    return view_pdf_page(document_id)

455 

456 

@library_bp.route("/document/<string:document_id>/txt")
@login_required
def view_text_page(document_id):
    """Page for viewing text content."""
    username = session["username"]

    with get_user_db_session(username) as db_session:
        # Text is stored directly on the Document row (text_content column).
        doc = db_session.query(Document).filter_by(id=document_id).first()

        if doc is None:
            logger.warning(f"Document not found for document ID {document_id}")
            return "Document not found", 404

        if not doc.text_content:
            logger.warning(f"Document {document_id} has no text content")
            return "Text content not available", 404

        logger.info(
            f"Serving text content for document {document_id}: {len(doc.text_content)} characters"
        )

        # Render as HTML page
        return render_template_with_defaults(
            "pages/document_text.html",
            document_id=document_id,
            title=doc.title or "Document Text",
            text_content=doc.text_content,
            extraction_method=doc.extraction_method,
            word_count=doc.word_count,
        )

488 

489 

@library_bp.route("/api/document/<string:document_id>/text")
@login_required
def serve_text_api(document_id):
    """API endpoint for serving text content (kept for backward compatibility)."""
    username = session["username"]

    with get_user_db_session(username) as db_session:
        # Text is stored directly on the Document row (text_content column).
        doc = db_session.query(Document).filter_by(id=document_id).first()

        if doc is None:
            logger.warning(f"Document not found for document ID {document_id}")
            return jsonify({"error": "Document not found"}), 404

        if not doc.text_content:
            logger.warning(f"Document {document_id} has no text content")
            return jsonify({"error": "Text content not available"}), 404

        logger.info(
            f"Serving text content for document {document_id}: {len(doc.text_content)} characters"
        )

        payload = {
            "text_content": doc.text_content,
            "title": doc.title or "Document",
            "extraction_method": doc.extraction_method,
            "word_count": doc.word_count,
        }
        return jsonify(payload)

520 

521 

@library_bp.route("/api/open-folder", methods=["POST"])
@login_required
def open_folder():
    """Open folder containing a document.

    Security: This endpoint is disabled for server deployments.
    It only makes sense for desktop usage where the server and client are on the same machine.
    """
    # Always refuse: this build never opens local folders.
    payload = {
        "status": "error",
        "message": "This feature is disabled. It is only available in desktop mode.",
    }
    return jsonify(payload), 403

536 

537 

@library_bp.route("/api/download/<int:resource_id>", methods=["POST"])
@login_required
def download_single_resource(resource_id):
    """Download a single resource."""
    username = session["username"]
    user_password = get_authenticated_user_password(username)

    with DownloadService(username, user_password) as service:
        success, error = service.download_resource(resource_id)
        if not success:
            # Log the real error; return a sanitized message to the client.
            logger.warning(
                f"Download failed for resource {resource_id}: {error}"
            )
            return jsonify(
                {
                    "success": False,
                    "error": "Download failed. Please try again or contact support.",
                }
            ), 500
        return jsonify({"success": True})

556 

557 

@library_bp.route("/api/download-text/<int:resource_id>", methods=["POST"])
@login_required
def download_text_single(resource_id):
    """Download a single resource as text file."""
    try:
        username = session["username"]
        user_password = get_authenticated_user_password(username)

        with DownloadService(username, user_password) as service:
            success, error = service.download_as_text(resource_id)

        if success:
            return jsonify({"success": True, "error": None})

        # Sanitize error message - don't expose internal details
        if error:
            logger.warning(
                f"Download as text failed for resource {resource_id}: {error}"
            )
        return jsonify(
            {"success": False, "error": "Failed to download resource"}
        )
    except AuthenticationRequiredError:
        raise  # Let blueprint error handler return 401
    except Exception as e:
        return handle_api_error(
            f"downloading resource {resource_id} as text", e
        )

586 

587 

@library_bp.route("/api/download-all-text", methods=["POST"])
@login_required
def download_all_text():
    """Download all undownloaded resources as text files.

    Streams progress as Server-Sent Events: one JSON "data:" line per
    processed resource, then a final completion event.
    """
    username = session["username"]
    # Capture Flask session ID to avoid scoping issues in nested function
    flask_session_id = session.get("session_id")

    def generate():
        # Get user password for database operations
        try:
            user_password = get_authenticated_user_password(
                username, flask_session_id
            )
        except AuthenticationRequiredError:
            logger.warning(
                f"Authentication unavailable for user {username} - password not in session store"
            )
            yield f"data: {json.dumps({'progress': 0, 'current': 0, 'total': 0, 'error': 'Authentication required', 'complete': True})}\n\n"
            return

        download_service = DownloadService(username, user_password)
        try:
            # Name the DB session "db_session" so it does not shadow the
            # module-level Flask ``session`` import (the previous name
            # "session" did, which is why flask_session_id is captured
            # in the enclosing scope above).
            with get_user_db_session(username) as db_session:
                # Get resources that don't have text files yet
                all_resources = db_session.query(ResearchResource).all()
                # Filter to only downloadable resources (academic/PDF)
                resources = [
                    r for r in all_resources if is_downloadable_url(r.url)
                ]

                # Filter resources that need text extraction
                txt_path = Path(download_service.library_root) / "txt"
                resources_to_process = []

                # Pre-scan directory once to get all existing resource IDs
                # (filenames follow the pattern *_{resource_id}.txt).
                existing_resource_ids = set()
                if txt_path.exists():
                    for txt_file in txt_path.glob("*.txt"):
                        parts = txt_file.stem.rsplit("_", 1)
                        if len(parts) == 2:
                            try:
                                existing_resource_ids.add(int(parts[1]))
                            except ValueError:
                                pass

                for resource in resources:
                    # Check if text file already exists using preloaded set
                    if resource.id not in existing_resource_ids:
                        resources_to_process.append(resource)

                total = len(resources_to_process)
                current = 0

                logger.info(f"Found {total} resources needing text extraction")

                for resource in resources_to_process:
                    current += 1
                    progress = (
                        int((current / total) * 100) if total > 0 else 100
                    )

                    # Guard on resource.title, not resource: the loop
                    # variable is always truthy, and slicing a None title
                    # would raise TypeError.
                    file_name = (
                        resource.title[:50]
                        if resource.title
                        else f"document_{current}.txt"
                    )

                    try:
                        success, error = download_service.download_as_text(
                            resource.id
                        )

                        if success:
                            status = "success"
                            error_msg = None
                        else:
                            status = "failed"
                            error_msg = error or "Text extraction failed"

                    except Exception as e:
                        logger.exception(
                            f"Error extracting text for resource {resource.id}"
                        )
                        status = "failed"
                        error_msg = (
                            f"Text extraction failed - {type(e).__name__}"
                        )

                    # Send update
                    update = {
                        "progress": progress,
                        "current": current,
                        "total": total,
                        "file": file_name,
                        "url": resource.url,  # Add the URL for UI display
                        "status": status,
                        "error": error_msg,
                    }
                    yield f"data: {json.dumps(update)}\n\n"

                # Send completion
                yield f"data: {json.dumps({'complete': True, 'total': total})}\n\n"
        finally:
            from ...utilities.resource_utils import safe_close

            safe_close(download_service, "download service")

    return Response(
        stream_with_context(generate()), mimetype="text/event-stream"
    )

701 

702 

@library_bp.route("/api/download-research/<research_id>", methods=["POST"])
@login_required
def download_research_pdfs(research_id):
    """Queue all PDFs from a research session for download."""
    username = session["username"]
    user_password = get_authenticated_user_password(username)

    with DownloadService(username, user_password) as service:
        # Optional collection_id from the JSON request body.
        body = request.json or {}
        target_collection = body.get("collection_id")

        queued = service.queue_research_downloads(
            research_id, target_collection
        )

        # Start processing queue (in production, this would be a background task)
        # For now, we'll process synchronously
        # TODO: Integrate with existing queue processor

        return jsonify({"success": True, "queued": queued})

722 

723 

@library_bp.route("/api/download-bulk", methods=["POST"])
@login_required
@require_json_body()
def download_bulk():
    """Download PDFs or extract text from multiple research sessions.

    Expects a JSON body with:
      - research_ids: list of research session IDs (required; 400 if empty)
      - mode: "pdf" to download PDFs, anything else for text-only extraction
      - collection_id: optional target collection for the downloads

    Returns a Server-Sent Events stream: one JSON "data:" event per queued
    item with progress/status, then a final completion event.
    """
    username = session["username"]
    data = request.json
    research_ids = data.get("research_ids", [])
    mode = data.get("mode", "pdf")  # pdf or text_only
    collection_id = data.get(
        "collection_id"
    )  # Optional: target collection for downloads

    if not research_ids:
        return jsonify({"error": "No research IDs provided"}), 400

    # Capture Flask session ID to avoid scoping issues in nested function
    flask_session_id = session.get("session_id")

    def generate():
        """Generate progress updates as Server-Sent Events."""
        # Get user password for database operations
        try:
            user_password = get_authenticated_user_password(
                username, flask_session_id
            )
        except AuthenticationRequiredError:
            logger.warning(
                f"Authentication unavailable for user {username} - password not in session store"
            )
            yield f"data: {json.dumps({'progress': 0, 'current': 0, 'total': 0, 'error': 'Authentication required', 'complete': True})}\n\n"
            return

        download_service = DownloadService(username, user_password)
        try:
            # Count total pending queue items across all research IDs
            total = 0
            current = 0

            # NOTE(review): "as session" shadows the Flask ``session``
            # import inside generate(); that is why flask_session_id is
            # captured in the enclosing scope above.
            with get_user_db_session(username) as session:
                for research_id in research_ids:
                    count = (
                        session.query(LibraryDownloadQueue)
                        .filter_by(
                            research_id=research_id,
                            status=DocumentStatus.PENDING,
                        )
                        .count()
                    )
                    total += count
                    logger.debug(
                        f"[PROGRESS_DEBUG] Research {research_id}: {count} pending items in queue"
                    )

            logger.info(
                f"[PROGRESS_DEBUG] Total pending downloads across all research: {total}"
            )
            yield f"data: {json.dumps({'progress': 0, 'current': 0, 'total': total})}\n\n"

            # Process each research
            for research_id in research_ids:
                # Get queued downloads for this research
                with get_user_db_session(username) as session:
                    # Get pending queue items for this research
                    queue_items = (
                        session.query(LibraryDownloadQueue)
                        .filter_by(
                            research_id=research_id,
                            status=DocumentStatus.PENDING,
                        )
                        .all()
                    )

                    # If no items queued yet, queue them now
                    if not queue_items:
                        try:
                            download_service.queue_research_downloads(
                                research_id, collection_id
                            )
                            # Re-fetch queue items
                            # NOTE(review): this re-fetch filters on the raw
                            # string "pending" while the queries above use
                            # DocumentStatus.PENDING - confirm both forms
                            # match the stored column value.
                            queue_items = (
                                session.query(LibraryDownloadQueue)
                                .filter_by(
                                    research_id=research_id, status="pending"
                                )
                                .all()
                            )
                        except Exception:
                            logger.exception(
                                f"Error queueing downloads for research {research_id}"
                            )
                            # Continue with empty queue_items
                            queue_items = []

                    # Process each queued item
                    for queue_item in queue_items:
                        logger.debug(
                            f"[PROGRESS_DEBUG] Before increment: current={current} (type: {type(current)}), total={total} (type: {type(total)})"
                        )
                        current += 1
                        logger.debug(
                            f"[PROGRESS_DEBUG] After increment: current={current} (type: {type(current)})"
                        )

                        # Check for division issues
                        if total is None:
                            logger.error(
                                "[PROGRESS_DEBUG] ERROR: total is None! Setting to 0 to avoid crash"
                            )
                            total = 0

                        progress = (
                            int((current / total) * 100) if total > 0 else 100
                        )
                        logger.debug(
                            f"[PROGRESS_DEBUG] Calculated progress: {progress}%"
                        )

                        # Get resource info
                        # NOTE(review): resource is always truthy here, so
                        # the fallback only matters if .get() returns None;
                        # a None resource.title would raise on the slice.
                        resource = session.query(ResearchResource).get(
                            queue_item.resource_id
                        )
                        file_name = (
                            resource.title[:50]
                            if resource
                            else f"document_{current}.pdf"
                        )

                        # Attempt actual download with error handling
                        skip_reason = None
                        status = "skipped"  # Default to skipped
                        success = False
                        error_msg = None

                        try:
                            logger.debug(
                                f"Attempting {'PDF download' if mode == 'pdf' else 'text extraction'} for resource {queue_item.resource_id}"
                            )

                            # Call appropriate service method based on mode
                            if mode == "pdf":
                                result = download_service.download_resource(
                                    queue_item.resource_id
                                )
                            else:  # text_only
                                result = download_service.download_as_text(
                                    queue_item.resource_id
                                )

                            # Handle new tuple return format
                            if isinstance(result, tuple):
                                success, skip_reason = result
                            else:
                                success = result
                                skip_reason = None

                            status = "success" if success else "skipped"
                            if skip_reason and not success:
                                error_msg = skip_reason
                                logger.info(
                                    f"{'Download' if mode == 'pdf' else 'Text extraction'} skipped for resource {queue_item.resource_id}: {skip_reason}"
                                )

                            logger.debug(
                                f"{'Download' if mode == 'pdf' else 'Text extraction'} result: success={success}, status={status}, skip_reason={skip_reason}"
                            )
                        except Exception as e:
                            # Log error but continue processing
                            error_msg = str(e)
                            error_type = type(e).__name__
                            logger.info(
                                f"CAUGHT Download exception for resource {queue_item.resource_id}: {error_type}: {error_msg}"
                            )
                            # Check if this is a skip reason (not a real error)
                            # Use error category + categorized message for user display
                            if any(
                                phrase in error_msg.lower()
                                for phrase in [
                                    "paywall",
                                    "subscription",
                                    "not available",
                                    "not found",
                                    "no free",
                                    "embargoed",
                                    "forbidden",
                                    "not accessible",
                                ]
                            ):
                                status = "skipped"
                                skip_reason = f"Document not accessible (paywall or access restriction) - {error_type}"
                            elif any(
                                phrase in error_msg.lower()
                                for phrase in [
                                    "failed to download",
                                    "could not",
                                    "invalid",
                                    "server",
                                ]
                            ):
                                status = "failed"
                                skip_reason = f"Download failed - {error_type}"
                            else:
                                status = "failed"
                                skip_reason = (
                                    f"Processing failed - {error_type}"
                                )
                            success = False

                        # Ensure skip_reason is set if we have an error message
                        # (defensive; the except branch always sets it, so
                        # coverage shows this branch never taken)
                        if error_msg and not skip_reason:
                            skip_reason = f"Processing failed - {error_type}"
                            logger.debug(
                                f"Setting skip_reason from error_msg: {error_msg}"
                            )

                        # Send progress update
                        update_data = {
                            "progress": progress,
                            "current": current,
                            "total": total,
                            "file": file_name,
                            "status": status,
                        }
                        # Add skip reason if available
                        if skip_reason:
                            update_data["error"] = skip_reason
                            logger.info(
                                f"Sending skip reason to UI: {skip_reason}"
                            )

                        logger.info(f"Update data being sent: {update_data}")
                        yield f"data: {json.dumps(update_data)}\n\n"

            yield f"data: {json.dumps({'progress': 100, 'current': total, 'total': total, 'complete': True})}\n\n"
        finally:
            from ...utilities.resource_utils import safe_close

            safe_close(download_service, "download service")

    return Response(
        stream_with_context(generate()),
        mimetype="text/event-stream",
        headers={"Cache-Control": "no-cache", "X-Accel-Buffering": "no"},
    )

968 

969 

@library_bp.route("/api/research-list")
@login_required
def get_research_list():
    """Return the user's research sessions formatted for dropdown selectors."""
    library = LibraryService(session["username"])
    return jsonify({"research": library.get_research_list_for_dropdown()})

978 

979 

@library_bp.route("/api/sync-library", methods=["POST"])
@login_required
def sync_library():
    """Reconcile the library database with the files actually on disk."""
    library = LibraryService(session["username"])
    sync_stats = library.sync_library_with_filesystem()
    return jsonify(sync_stats)

988 

989 

@library_bp.route("/api/mark-redownload", methods=["POST"])
@login_required
@require_json_body()
def mark_for_redownload():
    """Flag the given documents so they will be fetched again."""
    library = LibraryService(session["username"])

    document_ids = request.json.get("document_ids", [])
    if not document_ids:
        return jsonify({"error": "No document IDs provided"}), 400

    marked = library.mark_for_redownload(document_ids)
    return jsonify({"success": True, "marked": marked})

1006 

1007 

@library_bp.route("/api/queue-all-undownloaded", methods=["POST"])
@login_required
def queue_all_undownloaded():
    """Queue all articles that haven't been downloaded yet.

    Finds every research resource without a completed Document, runs the
    set through the retry/downloadability filters, and ensures a PENDING
    LibraryDownloadQueue entry exists for each eligible resource.

    Returns:
        JSON with queued/skipped counts, the affected research IDs, and a
        detailed filter summary for the UI.
    """
    username = session["username"]

    logger.info(f"queue_all_undownloaded called for user {username}")

    with get_user_db_session(username) as db_session:
        # Find all resources that don't have a completed download.  The
        # outer join matches a Document via either linkage direction
        # (Document.resource_id or ResearchResource.document_id); rows
        # with no completed match come back with Document.id IS NULL.
        undownloaded = (
            db_session.query(ResearchResource)
            .outerjoin(
                Document,
                (
                    (ResearchResource.id == Document.resource_id)
                    | (ResearchResource.document_id == Document.id)
                )
                & (Document.status == "completed"),
            )
            .filter(Document.id.is_(None))
            .all()
        )

        logger.info(f"Found {len(undownloaded)} total undownloaded resources")

        # Get user password for encrypted database access
        user_password = get_authenticated_user_password(username)

        resource_filter = ResourceFilter(username, user_password)
        filter_results = resource_filter.filter_downloadable_resources(
            undownloaded
        )

        # Get detailed filtering summary for the response payload
        filter_summary = resource_filter.get_filter_summary(undownloaded)
        skipped_info = resource_filter.get_skipped_resources_info(undownloaded)

        logger.info(f"Filter results: {filter_summary.to_dict()}")

        queued_count = 0
        research_ids = set()
        skipped_count = 0

        # Convert filter_results to dict for O(1) lookup instead of O(n²)
        filter_results_by_id = {r.resource_id: r for r in filter_results}

        for resource in undownloaded:
            # Check if resource passed the smart filter (retry policy)
            filter_result = filter_results_by_id.get(resource.id)

            if not filter_result or not filter_result.can_retry:
                skipped_count += 1
                if filter_result:
                    logger.debug(
                        f"Skipping resource {resource.id} due to retry policy: {filter_result.reason}"
                    )
                else:
                    logger.debug(
                        f"Skipping resource {resource.id} - no filter result available"
                    )
                continue

            # Resources without a URL can never be downloaded
            if not resource.url:
                skipped_count += 1
                continue

            # Check if it's downloadable using proper URL parsing.
            # (resource.url is known truthy past the guard above, so the
            # old redundant `resource.url and` re-checks were dropped.)
            is_downloadable = is_downloadable_domain(resource.url)

            # Log what we're checking
            if "pubmed" in resource.url.lower():
                logger.info(f"Found PubMed URL: {resource.url[:100]}")

            if not is_downloadable:
                skipped_count += 1
                logger.debug(
                    f"Skipping non-downloadable URL: {resource.url[:100]}"
                )
                continue

            # Check if already in queue (any status)
            existing_queue = (
                db_session.query(LibraryDownloadQueue)
                .filter_by(resource_id=resource.id)
                .first()
            )

            if existing_queue:
                # Reset non-pending entries back to pending; entries that
                # are already pending are left alone but still counted so
                # the UI total reflects the full queue.
                if existing_queue.status != DocumentStatus.PENDING:
                    existing_queue.status = DocumentStatus.PENDING
                    existing_queue.completed_at = None
                    logger.debug(
                        f"Reset queue entry for resource {resource.id} to pending"
                    )
                else:
                    logger.debug(
                        f"Resource {resource.id} already pending in queue"
                    )
            else:
                # Add new entry to queue
                queue_entry = LibraryDownloadQueue(
                    resource_id=resource.id,
                    research_id=resource.research_id,
                    priority=0,
                    status=DocumentStatus.PENDING,
                )
                db_session.add(queue_entry)
                logger.debug(
                    f"Added new queue entry for resource {resource.id}"
                )

            # Every branch above leaves the resource queued, so the
            # bookkeeping is hoisted here instead of duplicated 3x.
            queued_count += 1
            research_ids.add(resource.research_id)

        db_session.commit()

        logger.info(
            f"Queued {queued_count} articles for download, skipped {skipped_count} resources (including {filter_summary.permanently_failed_count} permanently failed and {filter_summary.temporarily_failed_count} temporarily failed)"
        )

        # Note: Removed synchronous download processing here to avoid blocking the HTTP request
        # Downloads will be processed via the SSE streaming endpoint or background tasks

        return jsonify(
            {
                "success": True,
                "queued": queued_count,
                "research_ids": list(research_ids),
                "total_undownloaded": len(undownloaded),
                "skipped": skipped_count,
                "filter_summary": filter_summary.to_dict(),
                "skipped_details": skipped_info,
            }
        )

1148 

1149 

@library_bp.route("/api/get-research-sources/<research_id>", methods=["GET"])
@login_required
def get_research_sources(research_id):
    """Get all sources for a research with snippets.

    Args:
        research_id: ID of the research session whose resources to list.

    Returns:
        JSON with a numbered list of sources; each entry carries URL,
        title, snippet, domain, and — when a completed Document exists —
        download metadata.
    """
    # Hoisted out of the per-resource loop below; re-importing on every
    # iteration was needless work.
    from urllib.parse import urlparse

    username = session["username"]

    sources = []
    with get_user_db_session(username) as db_session:
        # Get all resources for this research, oldest first
        resources = (
            db_session.query(ResearchResource)
            .filter_by(research_id=research_id)
            .order_by(ResearchResource.created_at)
            .all()
        )

        for idx, resource in enumerate(resources, 1):
            # Check if a downloaded document exists for this resource
            document = get_document_for_resource(db_session, resource)

            # Get domain from URL; malformed URLs just leave it empty
            domain = ""
            if resource.url:
                try:
                    domain = urlparse(resource.url).hostname or ""
                except (ValueError, AttributeError):
                    # urlparse can raise ValueError for malformed URLs
                    pass

            source_data = {
                "number": idx,
                "resource_id": resource.id,
                "url": resource.url,
                "title": resource.title or f"Source {idx}",
                "snippet": resource.content_preview or "",
                "domain": domain,
                "relevance_score": getattr(resource, "relevance_score", None),
                "downloaded": False,
                "document_id": None,
                "file_type": None,
            }

            if document and document.status == "completed":
                source_data.update(
                    {
                        "downloaded": True,
                        "document_id": document.id,
                        "file_type": document.file_type,
                        "download_date": document.created_at.isoformat()
                        if document.created_at
                        else None,
                    }
                )

            sources.append(source_data)

    return jsonify({"success": True, "sources": sources, "total": len(sources)})

1209 

1210 

@library_bp.route("/api/check-downloads", methods=["POST"])
@login_required
@require_json_body()
def check_downloads():
    """Report, for each requested URL, whether a completed download exists."""
    username = session["username"]
    payload = request.json
    research_id = payload.get("research_id")
    urls = payload.get("urls", [])

    if not research_id or not urls:
        return jsonify({"error": "Missing research_id or urls"}), 400

    download_status = {}

    with get_user_db_session(username) as db:
        # Restrict to this research's resources that match the given URLs
        matching = (
            db.query(ResearchResource)
            .filter_by(research_id=research_id)
            .filter(ResearchResource.url.in_(urls))
            .all()
        )

        for res in matching:
            # A completed Document means the source was downloaded
            doc = get_document_for_resource(db, res)
            if doc and doc.status == "completed":
                entry = {
                    "downloaded": True,
                    "document_id": doc.id,
                    "file_path": doc.file_path,
                    "file_type": doc.file_type,
                    "title": doc.title or res.title,
                }
            else:
                entry = {
                    "downloaded": False,
                    "resource_id": res.id,
                }
            download_status[res.url] = entry

    return jsonify({"download_status": download_status})

1254 

1255 

@library_bp.route("/api/download-source", methods=["POST"])
@login_required
@require_json_body()
def download_source():
    """Download a single source from a research.

    Validates the request, ensures a high-priority PENDING queue entry
    exists for the resource, then performs the download synchronously.

    Returns:
        JSON indicating success; 400 on bad input or non-downloadable
        domain, 404 if the resource is not found.
    """
    username = session["username"]
    user_password = get_authenticated_user_password(username)
    data = request.json
    research_id = data.get("research_id")
    url = data.get("url")

    if not research_id or not url:
        return jsonify({"error": "Missing research_id or url"}), 400

    # Check if URL is downloadable
    if not is_downloadable_domain(url):
        return jsonify({"error": "URL is not from a downloadable domain"}), 400

    with get_user_db_session(username) as db_session:
        # Find the resource
        resource = (
            db_session.query(ResearchResource)
            .filter_by(research_id=research_id, url=url)
            .first()
        )

        if not resource:
            return jsonify({"error": "Resource not found"}), 404

        # Check if already downloaded
        existing = get_document_for_resource(db_session, resource)

        if existing and existing.status == "completed":
            return jsonify(
                {
                    "success": True,
                    "message": "Already downloaded",
                    "document_id": existing.id,
                }
            )

        # Capture the ID while the instance is attached: commit() may
        # expire attributes, and the session is gone by download time.
        resource_id = resource.id

        # Add to download queue (or reset an existing entry to pending)
        queue_entry = (
            db_session.query(LibraryDownloadQueue)
            .filter_by(resource_id=resource_id)
            .first()
        )

        if not queue_entry:
            queue_entry = LibraryDownloadQueue(
                resource_id=resource_id,
                research_id=resource.research_id,
                priority=1,  # Higher priority for manual downloads
                status=DocumentStatus.PENDING,
            )
            db_session.add(queue_entry)
        else:
            queue_entry.status = DocumentStatus.PENDING
            queue_entry.priority = 1

        db_session.commit()

    # Start download immediately (outside the DB context so the session
    # isn't held open for the duration of the network transfer)
    with DownloadService(username, user_password) as service:
        success, message = service.download_resource(resource_id)

    if success:
        return jsonify({"success": True, "message": "Download completed"})
    # Log internal message, but show only generic message to user.
    # (Previously `message` was silently discarded despite this comment.)
    logger.warning(f"Download failed for resource {resource_id}: {message}")
    return jsonify({"success": False, "message": "Download failed"})