Coverage for src/local_deep_research/research_library/utils/__init__.py: 91%
114 statements
« prev ^ index » next coverage.py v7.14.1, created at 2026-06-03 23:15 +0000
« prev ^ index » next coverage.py v7.14.1, created at 2026-06-03 23:15 +0000
1"""Shared utility functions for the Research Library."""
3import hashlib
4import os
5import subprocess
6import sys
7from pathlib import Path
8from typing import Optional
9from urllib.parse import urlparse
11from flask import jsonify
12from loguru import logger
14from ...config.paths import get_library_directory
15from ...database.models.library import Document, DocumentCollection
16from ...security.path_validator import PathValidator
# Hostnames (and, implicitly, their subdomains) treated as downloadable
# academic sources. Kept at module level so the collection is built once
# instead of on every call.
_DOWNLOADABLE_DOMAINS = (
    "arxiv.org",
    "biorxiv.org",
    "medrxiv.org",
    "ncbi.nlm.nih.gov",
    "pubmed.ncbi.nlm.nih.gov",
    "europepmc.org",
    "semanticscholar.org",
    "researchgate.net",
    "academia.edu",
    "sciencedirect.com",
    "springer.com",
    "nature.com",
    "wiley.com",
    "ieee.org",
    "acm.org",
    "plos.org",
    "frontiersin.org",
    "mdpi.com",
    "acs.org",
    "rsc.org",
    "tandfonline.com",
    "sagepub.com",
    "oxford.com",
    "cambridge.org",
    "bmj.com",
    "nejm.org",
    "thelancet.com",
    "jamanetwork.com",
    "annals.org",
    "ahajournals.org",
    "cell.com",
    "science.org",
    "pnas.org",
    "elifesciences.org",
    "embopress.org",
    "journals.asm.org",
    "microbiologyresearch.org",
    "jvi.asm.org",
    "genome.cshlp.org",
    "genetics.org",
    "g3journal.org",
    "plantphysiol.org",
    "plantcell.org",
    "aspb.org",
    "bioone.org",
    "company-of-biologists.org",
    "biologists.org",
    "jeb.biologists.org",
    "dmm.biologists.org",
    "bio.biologists.org",
    "doi.org",
    "ssrn.com",
    "openreview.net",
)


def is_downloadable_domain(url: str) -> bool:
    """Check whether a URL points at a downloadable academic source or a PDF.

    The URL is lowercased and parsed with ``urlparse``; the checks below are
    applied in order and the first match wins:

    1. The path ends with ``.pdf`` or the URL contains ``.pdf?``.
    2. The hostname equals, or is a subdomain of, an entry in
       ``_DOWNLOADABLE_DOMAINS``.
    3. The hostname contains ``pubmed`` or the path contains ``/pubmed/``.
    4. The path contains ``/pdf/`` or the query contains ``type=pdf`` or
       ``format=pdf``.

    Args:
        url: The URL to check.

    Returns:
        True if any check matches. False for an empty URL, when nothing
        matches, or when parsing raises (a warning is logged in that case).
    """
    try:
        if not url:
            return False

        lowered = url.lower()
        parsed = urlparse(lowered)
        # A fully-qualified hostname may carry a trailing dot
        # ("arxiv.org."); drop it so it still matches the domain list.
        hostname = (parsed.hostname or "").rstrip(".")
        path = parsed.path or ""
        query = parsed.query or ""

        # Direct PDF files.
        if path.endswith(".pdf") or ".pdf?" in lowered:
            return True

        # Exact domain match, or a subdomain of a listed domain. The leading
        # dot prevents "notarxiv.org" from matching "arxiv.org".
        if any(
            hostname == domain or hostname.endswith("." + domain)
            for domain in _DOWNLOADABLE_DOMAINS
        ):
            return True

        # Special case for PubMed, which might appear in the path.
        if "pubmed" in hostname or "/pubmed/" in path:
            return True

        # PDF hinted at by the path or the query parameters.
        return "/pdf/" in path or "type=pdf" in query or "format=pdf" in query

    except Exception:
        # Deliberate best-effort: an unparsable URL is simply not downloadable.
        logger.warning(f"Error parsing URL {url}")
        return False
def is_downloadable_url(url: str) -> bool:
    """Report whether a URL is considered downloadable.

    Intended as the single entry point for downloadability checks. The
    decision is delegated entirely to ``is_downloadable_domain``.

    Args:
        url: The URL to check.

    Returns:
        Whatever ``is_downloadable_domain`` returns for ``url``.
    """
    downloadable = is_downloadable_domain(url)
    return downloadable
def get_document_for_resource(session, resource):
    """Look up the Document that belongs to a resource.

    If ``resource.document_id`` is truthy the Document is fetched by that id;
    otherwise the Document whose ``resource_id`` equals ``resource.id`` is
    fetched instead.

    Args:
        session: Object exposing ``query(...)`` (a SQLAlchemy session).
        resource: Object exposing ``document_id`` and ``id``.

    Returns:
        The first matching Document, or whatever ``.first()`` yields when
        there is no match.
    """
    direct_id = resource.document_id
    if direct_id:
        # The resource points straight at an existing Document.
        criteria = {"id": direct_id}
    else:
        # Fall back to the Document that references this resource.
        criteria = {"resource_id": resource.id}
    return session.query(Document).filter_by(**criteria).first()
def get_url_hash(url: str) -> str:
    """Return the SHA256 hex digest of a URL.

    The URL is lowercased before hashing, so URLs that differ only in case
    produce the same digest.

    Args:
        url: The URL to hash.

    Returns:
        The 64-character hexadecimal SHA256 digest.
    """
    normalized = url.lower()
    digest = hashlib.sha256()
    digest.update(normalized.encode())
    return digest.hexdigest()
def ensure_in_collection(
    session, document_id: str, collection_id: str
) -> "DocumentCollection":
    """Return the document/collection link, creating it when missing.

    A newly created link is added to the session with ``indexed=False``;
    this function itself does not commit or flush.

    Args:
        session: SQLAlchemy session.
        document_id: UUID of the document.
        collection_id: UUID of the collection.

    Returns:
        The existing or newly created DocumentCollection row.
    """
    link_query = session.query(DocumentCollection).filter_by(
        document_id=document_id, collection_id=collection_id
    )
    link = link_query.first()
    if not link:
        link = DocumentCollection(
            document_id=document_id,
            collection_id=collection_id,
            indexed=False,
        )
        session.add(link)
    return link
def get_library_storage_path(username: str) -> Path:
    """Resolve, and create if needed, the library directory for a user.

    Two settings are read through the settings manager:

    - ``research_library.storage_path``: base directory, defaulting to
      ``get_library_directory()``; it is user-expanded and resolved.
    - ``research_library.shared_library``: when truthy, the base directory
      itself is used for every user; otherwise a ``username`` subdirectory
      of it is used.

    Args:
        username: The username, used as the subdirectory name when the
            library is not shared.

    Returns:
        Path to the library storage directory, which exists on return.
    """
    from ...utilities.db_utils import get_settings_manager

    manager = get_settings_manager()

    configured_root = manager.get_setting(
        "research_library.storage_path",
        str(get_library_directory()),
    )
    root = Path(configured_root).expanduser().resolve()

    use_shared = manager.get_setting("research_library.shared_library", False)

    # Shared mode uses the root directly; otherwise each user is isolated
    # in a subdirectory of their own.
    target = root if use_shared else root / username
    target.mkdir(parents=True, exist_ok=True)
    return target
def open_file_location(file_path: str) -> bool:
    """Open the folder containing a file in the system file manager.

    The path is first passed through
    ``PathValidator.validate_local_filesystem_path``; the parent directory of
    the validated path is then opened with ``os.startfile`` on Windows,
    ``open`` on macOS, and ``xdg-open`` on every other platform.

    Args:
        file_path: Path to the file.

    Returns:
        True on success. False if the opener exits with a non-zero status or
        if any exception is raised (the failure is logged).
    """
    try:
        checked = PathValidator.validate_local_filesystem_path(file_path)
        directory = str(checked.parent)

        if sys.platform == "win32":
            os.startfile(directory)
            return True

        # macOS and everything else differ only in the opener command and
        # in the wording of the error message.
        on_mac = sys.platform == "darwin"
        opener = "open" if on_mac else "xdg-open"
        completed = subprocess.run(
            [opener, directory], capture_output=True, text=True, shell=False
        )
        if completed.returncode == 0:
            return True

        if on_mac:
            logger.error(f"Failed to open folder on macOS: {completed.stderr}")
        else:
            logger.error(f"Failed to open folder on Linux: {completed.stderr}")
        return False
    except Exception:
        logger.exception("Failed to open file location")
        return False
def get_absolute_library_path(
    relative_path: str, username: str
) -> Optional[Path]:
    """Turn a library-relative path into an absolute path for a user.

    The library root comes from ``get_library_storage_path(username)``. The
    relative path is checked with ``PathValidator.validate_safe_path`` against
    that root, and a result that is itself a symlink is rejected.

    Args:
        relative_path: The relative path from the library root.
        username: The username.

    Returns:
        The absolute path, or None when validation returns None, raises
        ValueError, or the resulting path is a symlink.
    """
    root = get_library_storage_path(username)
    try:
        validated = PathValidator.validate_safe_path(relative_path, str(root))
        if validated is None:
            return None

        candidate = Path(validated)
        if not candidate.is_symlink():
            return candidate

        logger.warning(f"Symlink blocked: {relative_path}")
        return None
    except ValueError:
        logger.warning(f"Path traversal blocked: {relative_path}")
        return None
def get_absolute_path_from_settings(relative_path: str) -> Optional[Path]:
    """Turn a library-relative path into an absolute path using settings.

    The library root is read from the ``research_library.storage_path``
    setting (defaulting to ``get_library_directory()``), user-expanded and
    resolved. A falsy ``relative_path`` yields the root itself; otherwise the
    path is checked with ``PathValidator.validate_safe_path`` against the
    root, and a result that is itself a symlink is rejected.

    Args:
        relative_path: The relative path from the library root.

    Returns:
        The absolute path, or None when validation returns None, raises
        ValueError, or the resulting path is a symlink.
    """
    from ...utilities.db_utils import get_settings_manager

    manager = get_settings_manager()
    configured_root = manager.get_setting(
        "research_library.storage_path",
        str(get_library_directory()),
    )
    root = Path(configured_root).expanduser().resolve()

    if not relative_path:
        return root

    try:
        validated = PathValidator.validate_safe_path(relative_path, str(root))
        if validated is None:
            return None

        candidate = Path(validated)
        if not candidate.is_symlink():
            return candidate

        logger.warning(f"Symlink blocked: {relative_path}")
        return None
    except ValueError:
        logger.warning(f"Path traversal blocked: {relative_path}")
        return None
def handle_api_error(operation: str, error: Exception, status_code: int = 500):
    """Log an API failure and build a generic JSON error response.

    The failure is logged via ``logger.exception``, while the response body
    carries only a fixed, generic message so no internal details reach the
    caller.

    Args:
        operation: Description of the operation that failed (for logging).
        error: The exception that occurred. It is not read in this function.
        status_code: HTTP status code to return (default: 500).

    Returns:
        Tuple of (Flask JSON response, status_code).
    """
    logger.exception(f"Error during {operation}")

    # Fixed payload: nothing from the exception is echoed back.
    payload = {
        "success": False,
        "error": "An internal error occurred. Please try again or contact support.",
    }
    return jsonify(payload), status_code