Coverage for src / local_deep_research / research_library / utils / __init__.py: 90%
107 statements
« prev ^ index » next coverage.py v7.13.5, created at 2026-04-14 23:55 +0000
« prev ^ index » next coverage.py v7.13.5, created at 2026-04-14 23:55 +0000
1"""Shared utility functions for the Research Library."""
3import hashlib
4import os
5import subprocess
6import sys
7from pathlib import Path
8from typing import Optional
9from urllib.parse import urlparse
11from flask import jsonify
12from loguru import logger
14from ...config.paths import get_library_directory
15from ...database.models.library import Document
16from ...security.path_validator import PathValidator
def is_downloadable_domain(url: str) -> bool:
    """Report whether ``url`` looks like a downloadable academic resource.

    The URL is lower-cased and parsed with ``urlparse``. It is accepted
    when any of the following holds:

    * the path ends in ``.pdf`` or the URL contains ``.pdf?``;
    * the hostname equals, or is a subdomain of, one of the listed domains;
    * ``pubmed`` occurs in the hostname or ``/pubmed/`` in the path;
    * the path contains ``/pdf/`` or the query contains ``type=pdf`` or
      ``format=pdf``.

    Args:
        url: The URL to classify.

    Returns:
        True when one of the rules above matches. False for a falsy URL,
        when nothing matches, or when handling the URL raises (a warning
        is logged in that last case).
    """
    # Domains treated as sources of downloadable papers.
    academic_domains = (
        "arxiv.org",
        "biorxiv.org",
        "medrxiv.org",
        "ncbi.nlm.nih.gov",
        "pubmed.ncbi.nlm.nih.gov",
        "europepmc.org",
        "semanticscholar.org",
        "researchgate.net",
        "academia.edu",
        "sciencedirect.com",
        "springer.com",
        "nature.com",
        "wiley.com",
        "ieee.org",
        "acm.org",
        "plos.org",
        "frontiersin.org",
        "mdpi.com",
        "acs.org",
        "rsc.org",
        "tandfonline.com",
        "sagepub.com",
        "oxford.com",
        "cambridge.org",
        "bmj.com",
        "nejm.org",
        "thelancet.com",
        "jamanetwork.com",
        "annals.org",
        "ahajournals.org",
        "cell.com",
        "science.org",
        "pnas.org",
        "elifesciences.org",
        "embopress.org",
        "journals.asm.org",
        "microbiologyresearch.org",
        "jvi.asm.org",
        "genome.cshlp.org",
        "genetics.org",
        "g3journal.org",
        "plantphysiol.org",
        "plantcell.org",
        "aspb.org",
        "bioone.org",
        "company-of-biologists.org",
        "biologists.org",
        "jeb.biologists.org",
        "dmm.biologists.org",
        "bio.biologists.org",
        "doi.org",
        "ssrn.com",
        "openreview.net",
    )

    try:
        if not url:
            return False

        lowered = url.lower()
        parts = urlparse(lowered)
        host = parts.hostname or ""
        url_path = parts.path or ""
        query_string = parts.query or ""

        # Direct link to a PDF file.
        if url_path.endswith(".pdf") or ".pdf?" in lowered:
            return True

        # Exact domain or a subdomain of it; the "." prefix stops
        # look-alike hosts (e.g. "notarxiv.org") from matching.
        if any(
            host == domain or host.endswith("." + domain)
            for domain in academic_domains
        ):
            return True

        # PubMed may show up in the host name or only in the path.
        if "pubmed" in host or "/pubmed/" in url_path:
            return True

        # Last chance: PDF hinted at by the path or the query parameters.
        return (
            "/pdf/" in url_path
            or "type=pdf" in query_string
            or "format=pdf" in query_string
        )

    except Exception:
        logger.warning(f"Error parsing URL {url}")
        return False
def is_downloadable_url(url: str) -> bool:
    """Decide whether ``url`` can be downloaded.

    Intended as the single entry point for downloadability checks. The
    decision itself is delegated unchanged to ``is_downloadable_domain``.

    Args:
        url: The URL to check.

    Returns:
        Whatever ``is_downloadable_domain`` returns for ``url``.
    """
    downloadable = is_downloadable_domain(url)
    return downloadable
def get_document_for_resource(session, resource):
    """Look up the Document that belongs to a ResearchResource.

    When ``resource.document_id`` is truthy the Document is fetched by that
    id; otherwise the lookup falls back to the Document whose
    ``resource_id`` equals ``resource.id``.

    Args:
        session: Object exposing ``query(...)``; presumably a SQLAlchemy
            session (confirm against callers).
        resource: Object exposing ``document_id`` and ``id``.

    Returns:
        The first matching Document as returned by ``.first()``.
    """
    # Pick the filter first, then run a single query.
    if resource.document_id:
        criteria = {"id": resource.document_id}
    else:
        criteria = {"resource_id": resource.id}
    return session.query(Document).filter_by(**criteria).first()
def get_url_hash(url: str) -> str:
    """
    Compute the SHA256 hex digest of a URL.

    The URL is lower-cased before hashing, so URLs that differ only in
    letter case produce the same hash.

    Args:
        url: The URL to hash

    Returns:
        The 64-character hexadecimal SHA256 digest
    """
    hasher = hashlib.sha256()
    hasher.update(url.lower().encode())
    return hasher.hexdigest()
def get_library_storage_path(username: str) -> Path:
    """
    Return (and create if needed) the library storage directory for a user.

    Settings consulted:
    - research_library.storage_path: base directory; defaults to
      ``get_library_directory()``. The value is user-expanded and resolved.
    - research_library.shared_library: when truthy, every user gets the
      base directory itself instead of a per-user subdirectory.

    Args:
        username: The username; used as the subdirectory name when the
            library is not shared.

    Returns:
        Path to the storage directory, which exists after this call
    """
    from ...utilities.db_utils import get_settings_manager

    manager = get_settings_manager()

    configured_root = manager.get_setting(
        "research_library.storage_path",
        str(get_library_directory()),
    )
    root = Path(configured_root).expanduser().resolve()

    use_shared = manager.get_setting("research_library.shared_library", False)

    # Shared mode -> the root itself; otherwise one subdirectory per user.
    # NOTE(review): username is joined as-is; presumably it is validated
    # by the caller - confirm.
    storage_dir = root if use_shared else root / username
    storage_dir.mkdir(parents=True, exist_ok=True)
    return storage_dir
def open_file_location(file_path: str) -> bool:
    """
    Reveal the folder containing a file in the system file manager.

    The path is first passed through
    ``PathValidator.validate_local_filesystem_path``; the parent directory
    of the validated path is then opened with ``os.startfile`` on Windows,
    ``open`` on macOS, and ``xdg-open`` on every other platform.

    Args:
        file_path: Path to the file

    Returns:
        True if the folder was opened, False if the launcher exited with a
        non-zero code or any exception occurred (both cases are logged)
    """
    try:
        # Validation failures raise and are handled by the except below.
        validated = PathValidator.validate_local_filesystem_path(file_path)
        folder = str(validated.parent)
        platform = sys.platform

        if platform == "win32":
            os.startfile(folder)
            return True

        if platform == "darwin":  # macOS
            result = subprocess.run(
                ["open", folder], capture_output=True, text=True, shell=False
            )
            if result.returncode == 0:
                return True
            logger.error(f"Failed to open folder on macOS: {result.stderr}")
            return False

        # Anything else is treated as Linux.
        result = subprocess.run(
            ["xdg-open", folder],
            capture_output=True,
            text=True,
            shell=False,
        )
        if result.returncode == 0:
            return True
        logger.error(f"Failed to open folder on Linux: {result.stderr}")
        return False
    except Exception:
        logger.exception("Failed to open file location")
        return False
def get_absolute_library_path(
    relative_path: str, username: str
) -> Optional[Path]:
    """
    Resolve a library-relative path inside the user's library root.

    The root comes from ``get_library_storage_path(username)`` and the
    relative path is checked with ``PathValidator.validate_safe_path``.
    Paths that are symlinks are refused.

    Args:
        relative_path: The relative path from library root
        username: The username

    Returns:
        The absolute path, or None when the validator returns None, the
        validator raises ValueError, or the resulting path is a symlink
    """
    library_root = get_library_storage_path(username)
    try:
        validated = PathValidator.validate_safe_path(
            relative_path, str(library_root)
        )
        if validated is not None:
            candidate = Path(validated)
            if not candidate.is_symlink():
                return candidate
            logger.warning(f"Symlink blocked: {relative_path}")
        return None
    except ValueError:
        logger.warning(f"Path traversal blocked: {relative_path}")
        return None
def get_absolute_path_from_settings(relative_path: str) -> Optional[Path]:
    """
    Resolve a library-relative path against the configured library root.

    The root is read from the ``research_library.storage_path`` setting
    (default: ``get_library_directory()``), user-expanded and resolved.
    A falsy ``relative_path`` yields the root itself; otherwise the path
    is checked with ``PathValidator.validate_safe_path`` and symlinks are
    refused.

    Args:
        relative_path: The relative path from library root

    Returns:
        The absolute path, or None when the validator returns None, the
        validator raises ValueError, or the resulting path is a symlink
    """
    from ...utilities.db_utils import get_settings_manager

    manager = get_settings_manager()
    configured_root = manager.get_setting(
        "research_library.storage_path",
        str(get_library_directory()),
    )
    library_root = Path(configured_root).expanduser().resolve()

    # Nothing to join: the caller asked for the root.
    if not relative_path:
        return library_root

    try:
        validated = PathValidator.validate_safe_path(
            relative_path, str(library_root)
        )
        if validated is not None:
            candidate = Path(validated)
            if not candidate.is_symlink():
                return candidate
            logger.warning(f"Symlink blocked: {relative_path}")
        return None
    except ValueError:
        logger.warning(f"Path traversal blocked: {relative_path}")
        return None
def handle_api_error(operation: str, error: Exception, status_code: int = 500):
    """
    Log an API failure and build a generic JSON error response.

    Details are written to the log only; the response body carries a fixed
    message so that nothing internal reaches the client.

    Args:
        operation: Description of the operation that failed (for logging)
        error: The exception that occurred. It is not referenced by the
            body; ``logger.exception`` is called without it, so the logged
            traceback presumably comes from the exception being handled
            when this function is called (confirm against loguru docs).
        status_code: HTTP status code to return (default: 500)

    Returns:
        Tuple of (Flask JSON response, status_code)
    """
    # Internal record of the failure.
    logger.exception(f"Error during {operation}")

    # Client-facing payload: deliberately free of internal details.
    payload = {
        "success": False,
        "error": "An internal error occurred. Please try again or contact support.",
    }
    return jsonify(payload), status_code