Coverage for src/local_deep_research/research_library/utils/__init__.py: 91%

114 statements  

« prev     ^ index     » next       coverage.py v7.14.1, created at 2026-06-03 23:15 +0000

1"""Shared utility functions for the Research Library.""" 

2 

3import hashlib 

4import os 

5import subprocess 

6import sys 

7from pathlib import Path 

8from typing import Optional 

9from urllib.parse import urlparse 

10 

11from flask import jsonify 

12from loguru import logger 

13 

14from ...config.paths import get_library_directory 

15from ...database.models.library import Document, DocumentCollection 

16from ...security.path_validator import PathValidator 

17 

18 

# Hostnames treated as downloadable academic sources. A URL matches when its
# hostname equals an entry or is a subdomain of one. Kept at module level so
# the collection is built once instead of on every call.
_DOWNLOADABLE_DOMAINS = (
    "arxiv.org",
    "biorxiv.org",
    "medrxiv.org",
    "ncbi.nlm.nih.gov",
    "pubmed.ncbi.nlm.nih.gov",
    "europepmc.org",
    "semanticscholar.org",
    "researchgate.net",
    "academia.edu",
    "sciencedirect.com",
    "springer.com",
    "nature.com",
    "wiley.com",
    "ieee.org",
    "acm.org",
    "plos.org",
    "frontiersin.org",
    "mdpi.com",
    "acs.org",
    "rsc.org",
    "tandfonline.com",
    "sagepub.com",
    "oxford.com",
    "cambridge.org",
    "bmj.com",
    "nejm.org",
    "thelancet.com",
    "jamanetwork.com",
    "annals.org",
    "ahajournals.org",
    "cell.com",
    "science.org",
    "pnas.org",
    "elifesciences.org",
    "embopress.org",
    "journals.asm.org",
    "microbiologyresearch.org",
    "jvi.asm.org",
    "genome.cshlp.org",
    "genetics.org",
    "g3journal.org",
    "plantphysiol.org",
    "plantcell.org",
    "aspb.org",
    "bioone.org",
    "company-of-biologists.org",
    "biologists.org",
    "jeb.biologists.org",
    "dmm.biologists.org",
    "bio.biologists.org",
    "doi.org",
    "ssrn.com",
    "openreview.net",
)


def is_downloadable_domain(url: str) -> bool:
    """Check whether a URL looks downloadable, using proper URL parsing.

    The URL is lower-cased before parsing, so every check is
    case-insensitive. A URL is considered downloadable when any of the
    following holds:

    * its path ends with ``.pdf`` or the URL contains ``.pdf?``;
    * its hostname equals, or is a subdomain of, an entry in
      ``_DOWNLOADABLE_DOMAINS``;
    * ``pubmed`` appears in the hostname or ``/pubmed/`` in the path;
    * the path contains ``/pdf/`` or the query string contains
      ``type=pdf`` or ``format=pdf``.

    Args:
        url: The URL to check.

    Returns:
        True if one of the rules above matches. False for a falsy ``url``,
        for a URL matching no rule, or when any exception is raised while
        examining the URL (a warning is logged in that case).
    """
    try:
        if not url:
            return False

        # Lower-case once; the parsed pieces and the raw substring check
        # below both work on this normalized form.
        lowered = url.lower()
        parsed = urlparse(lowered)
        hostname = parsed.hostname or ""
        path = parsed.path or ""
        query = parsed.query or ""

        # Direct PDF files
        if path.endswith(".pdf") or ".pdf?" in lowered:
            return True

        # Exact host or subdomain match; the leading dot prevents
        # "notarxiv.org" from matching "arxiv.org".
        if any(
            hostname == domain or hostname.endswith("." + domain)
            for domain in _DOWNLOADABLE_DOMAINS
        ):
            return True

        # Special case for PubMed, which might appear in the path
        if "pubmed" in hostname or "/pubmed/" in path:
            return True

        # PDF hinted at by the path or by query parameters
        return "/pdf/" in path or "type=pdf" in query or "format=pdf" in query

    except Exception:
        logger.warning(f"Error parsing URL {url}")
        return False

109 

110 

def is_downloadable_url(url: str) -> bool:
    """Report whether ``url`` is downloadable.

    Intended as the single entry point for downloadability checks. The
    whole decision is delegated to ``is_downloadable_domain``, which
    combines the academic-domain check with PDF extension/path detection.

    Args:
        url: The URL to check.

    Returns:
        Whatever ``is_downloadable_domain(url)`` returns: True when the URL
        is from a downloadable academic domain or is a direct PDF link.
    """
    downloadable = is_downloadable_domain(url)
    return downloadable

124 

125 

def get_document_for_resource(session, resource):
    """Look up the Document that belongs to a ResearchResource.

    Two lookups are possible, tried in this order:

    1. If ``resource.document_id`` is truthy, fetch the Document with that
       id (library resources point directly to existing Documents).
    2. Otherwise fetch the Document whose ``resource_id`` equals
       ``resource.id`` (web downloads create Documents with
       ``resource_id`` set).

    Returns:
        The first matching Document, or whatever ``.first()`` yields when
        nothing matches.
    """
    if not resource.document_id:
        # No direct link: fall back to the reverse lookup by resource id.
        by_resource = session.query(Document).filter_by(resource_id=resource.id)
        return by_resource.first()

    by_id = session.query(Document).filter_by(id=resource.document_id)
    return by_id.first()

138 

139 

def get_url_hash(url: str) -> str:
    """
    Compute the SHA256 hex digest of a URL.

    The URL is lower-cased before hashing, so URLs that differ only in
    letter case produce the same digest.

    Args:
        url: The URL to hash

    Returns:
        The SHA256 hash of the lower-cased URL as a hexadecimal string
    """
    normalized = url.lower()
    hasher = hashlib.sha256()
    hasher.update(normalized.encode())
    return hasher.hexdigest()

151 

152 

def ensure_in_collection(
    session, document_id: str, collection_id: str
) -> "DocumentCollection":
    """Fetch the document/collection link row, creating it when absent.

    A newly created row is built with ``indexed=False`` and handed to
    ``session.add``; this function does not flush or commit.

    Args:
        session: SQLAlchemy session
        document_id: UUID of the document
        collection_id: UUID of the collection

    Returns:
        The existing or newly created DocumentCollection row
    """
    link = (
        session.query(DocumentCollection)
        .filter_by(document_id=document_id, collection_id=collection_id)
        .first()
    )
    if not link:
        # No link yet: create one and register it with the session.
        link = DocumentCollection(
            document_id=document_id,
            collection_id=collection_id,
            indexed=False,
        )
        session.add(link)
    return link

181 

182 

def get_library_storage_path(username: str) -> Path:
    """
    Resolve (and create if needed) the directory holding a user's library.

    Two settings are read through the settings manager:
    - research_library.storage_path: Base path for library storage; the
      default is the value of ``get_library_directory()``
    - research_library.shared_library: If truthy, all users share the base
      directory; otherwise each user gets a ``<base>/<username>`` subdirectory

    The chosen directory is created (including parents) when missing.

    Args:
        username: The username

    Returns:
        Path to the library storage directory
    """
    from ...utilities.db_utils import get_settings_manager

    manager = get_settings_manager()

    # Base path from settings, with "~" expanded and made absolute.
    configured_root = manager.get_setting(
        "research_library.storage_path",
        str(get_library_directory()),
    )
    root = Path(configured_root).expanduser().resolve()

    use_shared = manager.get_setting("research_library.shared_library", False)

    # Shared mode uses the base directory itself; the default isolates
    # users in per-username subdirectories.
    target = root if use_shared else root / username
    target.mkdir(parents=True, exist_ok=True)
    return target

226 

227 

def open_file_location(file_path: str) -> bool:
    """
    Show the folder containing ``file_path`` in the system file manager.

    The path is first passed through
    ``PathValidator.validate_local_filesystem_path``; the parent directory
    of the validated path is then opened with ``os.startfile`` on Windows,
    ``open`` on macOS, and ``xdg-open`` on every other platform.

    Args:
        file_path: Path to the file

    Returns:
        True if successful. False if the launcher command exits with a
        non-zero status or if any exception is raised (both are logged).
    """
    try:
        # Validation happens before anything is launched.
        checked = PathValidator.validate_local_filesystem_path(file_path)
        directory = str(checked.parent)

        if sys.platform == "win32":
            os.startfile(directory)
            return True

        if sys.platform == "darwin":  # macOS
            completed = subprocess.run(
                ["open", directory], capture_output=True, text=True, shell=False
            )
            if completed.returncode != 0:
                logger.error(f"Failed to open folder on macOS: {completed.stderr}")
                return False
            return True

        # Any remaining platform is treated as Linux.
        completed = subprocess.run(
            ["xdg-open", directory],
            capture_output=True,
            text=True,
            shell=False,
        )
        if completed.returncode != 0:
            logger.error(f"Failed to open folder on Linux: {completed.stderr}")
            return False
        return True
    except Exception:
        logger.exception("Failed to open file location")
        return False

265 

266 

def get_absolute_library_path(
    relative_path: str, username: str
) -> Optional[Path]:
    """
    Turn a library-relative path into an absolute path for a user.

    The library root comes from ``get_library_storage_path(username)``.
    The relative path is checked against that root with
    ``PathValidator.validate_safe_path`` to prevent path traversal, and a
    result that is itself a symlink is rejected.

    Args:
        relative_path: The relative path from library root
        username: The username

    Returns:
        The absolute path, or None if the validator returns None, raises
        ValueError, or the resulting path is a symlink
    """
    root = get_library_storage_path(username)
    try:
        candidate = PathValidator.validate_safe_path(relative_path, str(root))
        if candidate is not None:
            resolved = Path(candidate)
            if not resolved.is_symlink():
                return resolved
            logger.warning(f"Symlink blocked: {relative_path}")
        return None
    except ValueError:
        logger.warning(f"Path traversal blocked: {relative_path}")
        return None

298 

299 

def get_absolute_path_from_settings(relative_path: str) -> Optional[Path]:
    """
    Turn a library-relative path into an absolute path under the library root.

    The root is read from the ``research_library.storage_path`` setting
    (default: ``get_library_directory()``), with "~" expanded and the path
    resolved. A falsy ``relative_path`` yields the root itself. Otherwise
    the path is checked with ``PathValidator.validate_safe_path`` to
    prevent path traversal, and a result that is itself a symlink is
    rejected.

    Args:
        relative_path: The relative path from library root

    Returns:
        The absolute path, or None if the validator returns None, raises
        ValueError, or the resulting path is a symlink
    """
    from ...utilities.db_utils import get_settings_manager

    manager = get_settings_manager()
    configured_root = manager.get_setting(
        "research_library.storage_path",
        str(get_library_directory()),
    )
    root = Path(configured_root).expanduser().resolve()

    if not relative_path:
        # Nothing to append: the caller gets the library root.
        return root

    try:
        candidate = PathValidator.validate_safe_path(relative_path, str(root))
        if candidate is not None:
            resolved = Path(candidate)
            if not resolved.is_symlink():
                return resolved
            logger.warning(f"Symlink blocked: {relative_path}")
        return None
    except ValueError:
        logger.warning(f"Path traversal blocked: {relative_path}")
        return None

344 

345 

def handle_api_error(operation: str, error: Exception, status_code: int = 500):
    """
    Log an API failure internally and build a generic error response.

    Details are written to the log via ``logger.exception`` while the
    caller-facing payload carries only a fixed, generic message, so no
    internal information is exposed to the user.

    Args:
        operation: Description of the operation that failed (for logging)
        error: The exception that occurred. NOTE(review): this body never
            references it; presumably the function is meant to be called
            from inside an ``except`` block so the logger can pick up the
            active exception - confirm with callers.
        status_code: HTTP status code to return (default: 500)

    Returns:
        Flask JSON response tuple (response, status_code)
    """
    # Internal log entry, named after the failed operation.
    logger.exception(f"Error during {operation}")

    # Generic payload for the user (no internal details exposed).
    payload = {
        "success": False,
        "error": "An internal error occurred. Please try again or contact support.",
    }
    return jsonify(payload), status_code