Coverage for src / local_deep_research / research_library / utils / __init__.py: 90%

107 statements  

« prev     ^ index     » next       coverage.py v7.13.5, created at 2026-04-14 23:55 +0000

1"""Shared utility functions for the Research Library.""" 

2 

3import hashlib 

4import os 

5import subprocess 

6import sys 

7from pathlib import Path 

8from typing import Optional 

9from urllib.parse import urlparse 

10 

11from flask import jsonify 

12from loguru import logger 

13 

14from ...config.paths import get_library_directory 

15from ...database.models.library import Document 

16from ...security.path_validator import PathValidator 

17 

18 

# Hostnames treated as downloadable academic sources. A URL's hostname matches
# when it equals an entry or is a subdomain of one (checked with a "." prefix).
# Built once at import time instead of on every call.
_DOWNLOADABLE_DOMAINS = (
    "arxiv.org",
    "biorxiv.org",
    "medrxiv.org",
    "ncbi.nlm.nih.gov",
    "pubmed.ncbi.nlm.nih.gov",
    "europepmc.org",
    "semanticscholar.org",
    "researchgate.net",
    "academia.edu",
    "sciencedirect.com",
    "springer.com",
    "nature.com",
    "wiley.com",
    "ieee.org",
    "acm.org",
    "plos.org",
    "frontiersin.org",
    "mdpi.com",
    "acs.org",
    "rsc.org",
    "tandfonline.com",
    "sagepub.com",
    "oxford.com",
    "cambridge.org",
    "bmj.com",
    "nejm.org",
    "thelancet.com",
    "jamanetwork.com",
    "annals.org",
    "ahajournals.org",
    "cell.com",
    "science.org",
    "pnas.org",
    "elifesciences.org",
    "embopress.org",
    "journals.asm.org",
    "microbiologyresearch.org",
    "jvi.asm.org",
    "genome.cshlp.org",
    "genetics.org",
    "g3journal.org",
    "plantphysiol.org",
    "plantcell.org",
    "aspb.org",
    "bioone.org",
    "company-of-biologists.org",
    "biologists.org",
    "jeb.biologists.org",
    "dmm.biologists.org",
    "bio.biologists.org",
    "doi.org",
    "ssrn.com",
    "openreview.net",
)


def is_downloadable_domain(url: str) -> bool:
    """Check whether a URL points at a downloadable academic source or a PDF.

    The URL is lower-cased and parsed with ``urlparse``; it is considered
    downloadable when any of the following holds:

    * the path ends in ``.pdf`` or ``.pdf?`` appears anywhere in the URL;
    * the hostname equals, or is a subdomain of, a known academic domain;
    * ``pubmed`` appears in the hostname or ``/pubmed/`` in the path;
    * the path contains ``/pdf/`` or the query contains ``type=pdf`` or
      ``format=pdf``.

    Args:
        url: The URL to check.

    Returns:
        True if the URL matches one of the rules above. False otherwise,
        including for empty input and for input that cannot be parsed.
    """
    try:
        if not url:
            return False

        lowered = url.lower()
        parsed = urlparse(lowered)
        # A fully-qualified hostname may carry a trailing root dot
        # ("arxiv.org."); strip it so it matches the same as "arxiv.org".
        hostname = (parsed.hostname or "").rstrip(".")
        path = parsed.path or ""
        query = parsed.query or ""

        # Direct PDF files.
        if path.endswith(".pdf") or ".pdf?" in lowered:
            return True

        # Exact domain or subdomain match; the "." prefix keeps
        # "notarxiv.org" from matching "arxiv.org".
        if any(
            hostname == domain or hostname.endswith("." + domain)
            for domain in _DOWNLOADABLE_DOMAINS
        ):
            return True

        # Special case for PubMed, which might appear in the path.
        if "pubmed" in hostname or "/pubmed/" in path:
            return True

        # PDF hinted at by the path or the query parameters.
        return "/pdf/" in path or "type=pdf" in query or "format=pdf" in query

    except Exception:
        # Best-effort check: input that cannot be parsed is reported and
        # treated as not downloadable rather than raising.
        logger.warning(f"Error parsing URL {url}")
        return False

109 

110 

def is_downloadable_url(url: str) -> bool:
    """Report whether ``url`` can be downloaded.

    Intended as the single entry point for downloadability checks. The
    decision is delegated entirely to ``is_downloadable_domain``, which
    combines the academic-domain check with PDF extension/path detection.

    Args:
        url: The URL to check.

    Returns:
        Whatever ``is_downloadable_domain`` returns for ``url``.
    """
    downloadable = is_downloadable_domain(url)
    return downloadable

124 

125 

def get_document_for_resource(session, resource):
    """Look up the Document that belongs to a ResearchResource.

    When ``resource.document_id`` is set, the Document with that id is
    returned (library resources point directly to existing Documents).
    Otherwise the lookup goes through ``Document.resource_id`` matching
    ``resource.id`` (web downloads create Documents with resource_id set).

    Returns:
        The first matching Document, or whatever ``.first()`` yields when
        there is no match.
    """
    linked_id = resource.document_id
    if not linked_id:
        # No direct link: find the Document that references this resource.
        criteria = {"resource_id": resource.id}
    else:
        criteria = {"id": linked_id}
    return session.query(Document).filter_by(**criteria).first()

138 

139 

def get_url_hash(url: str) -> str:
    """
    Compute the SHA256 hex digest of a URL.

    The URL is lower-cased before hashing, so URLs that differ only in
    letter case produce the same hash.

    Args:
        url: The URL to hash

    Returns:
        The hexadecimal SHA256 digest of the lower-cased URL
    """
    normalized = url.lower()
    hasher = hashlib.sha256()
    hasher.update(normalized.encode())
    return hasher.hexdigest()

151 

152 

def get_library_storage_path(username: str) -> Path:
    """
    Resolve (and create if needed) the directory holding a user's library.

    Two settings are read through the settings manager:
    - research_library.storage_path: Base path for library storage
      (defaults to ``get_library_directory()``)
    - research_library.shared_library: If true, all users share the base
      directory; otherwise each user gets a ``<base>/<username>`` subdirectory

    Args:
        username: The username

    Returns:
        Path to the library storage directory; it exists when this returns
    """
    from ...utilities.db_utils import get_settings_manager

    manager = get_settings_manager()

    # Base path from settings, with "~" expanded and made absolute.
    configured_root = manager.get_setting(
        "research_library.storage_path",
        str(get_library_directory()),
    )
    root_dir = Path(configured_root).expanduser().resolve()

    use_shared = manager.get_setting("research_library.shared_library", False)

    # Shared mode: everyone uses the base directory.
    # Default: user isolation with per-user subdirectories.
    # NOTE(review): username is joined as-is; presumably it is validated
    # upstream so it cannot contain path separators or "..". Confirm.
    target_dir = root_dir if use_shared else root_dir / username
    target_dir.mkdir(parents=True, exist_ok=True)
    return target_dir

196 

197 

def open_file_location(file_path: str) -> bool:
    """
    Show the folder containing a file in the system file manager.

    The path is first passed through
    ``PathValidator.validate_local_filesystem_path``; the parent directory
    of the validated path is then opened with ``os.startfile`` on Windows,
    ``open`` on macOS and ``xdg-open`` everywhere else.

    Args:
        file_path: Path to the file

    Returns:
        True if the folder was opened, False if the opener exited with a
        non-zero status or any exception occurred
    """
    try:
        # Validate path is safe before handing it to an external program.
        checked_path = PathValidator.validate_local_filesystem_path(file_path)
        target_dir = str(checked_path.parent)
        platform_name = sys.platform

        if platform_name == "win32":
            os.startfile(target_dir)
            return True

        if platform_name == "darwin":  # macOS
            completed = subprocess.run(
                ["open", target_dir],
                capture_output=True,
                text=True,
                shell=False,
            )
            if completed.returncode != 0:
                logger.error(
                    f"Failed to open folder on macOS: {completed.stderr}"
                )
                return False
            return True

        # Any other platform is handled like Linux.
        completed = subprocess.run(
            ["xdg-open", target_dir],
            capture_output=True,
            text=True,
            shell=False,
        )
        if completed.returncode != 0:
            logger.error(f"Failed to open folder on Linux: {completed.stderr}")
            return False
        return True
    except Exception:
        logger.exception("Failed to open file location")
        return False

235 

236 

def get_absolute_library_path(
    relative_path: str, username: str
) -> Optional[Path]:
    """
    Turn a library-relative path into an absolute path for a user.

    The library root comes from ``get_library_storage_path(username)`` and
    the relative path is checked against it with
    ``PathValidator.validate_safe_path``.

    Args:
        relative_path: The relative path from library root
        username: The username

    Returns:
        The absolute path, or None when validation returns None, raises
        ValueError, or the resulting path is a symlink
    """
    root_dir = get_library_storage_path(username)
    try:
        validated = PathValidator.validate_safe_path(
            relative_path, str(root_dir)
        )
        if validated is not None:
            candidate = Path(validated)
            if not candidate.is_symlink():
                return candidate
            # Symlinks are rejected even after validation succeeded.
            logger.warning(f"Symlink blocked: {relative_path}")
        return None
    except ValueError:
        # A ValueError from validation is reported as a traversal attempt.
        logger.warning(f"Path traversal blocked: {relative_path}")
        return None

268 

269 

def get_absolute_path_from_settings(relative_path: str) -> Optional[Path]:
    """
    Turn a library-relative path into an absolute path under the configured root.

    The root is the ``research_library.storage_path`` setting (defaulting to
    ``get_library_directory()``), expanded and resolved. The relative path is
    checked against it with ``PathValidator.validate_safe_path``.

    Args:
        relative_path: The relative path from library root

    Returns:
        The library root itself when ``relative_path`` is empty; otherwise
        the absolute path, or None when validation returns None, raises
        ValueError, or the resulting path is a symlink
    """
    from ...utilities.db_utils import get_settings_manager

    configured_root = get_settings_manager().get_setting(
        "research_library.storage_path",
        str(get_library_directory()),
    )
    root_dir = Path(configured_root).expanduser().resolve()

    # Nothing to append: the root is the answer.
    if not relative_path:
        return root_dir

    try:
        validated = PathValidator.validate_safe_path(
            relative_path, str(root_dir)
        )
        if validated is not None:
            candidate = Path(validated)
            if not candidate.is_symlink():
                return candidate
            # Symlinks are rejected even after validation succeeded.
            logger.warning(f"Symlink blocked: {relative_path}")
        return None
    except ValueError:
        # A ValueError from validation is reported as a traversal attempt.
        logger.warning(f"Path traversal blocked: {relative_path}")
        return None

314 

315 

def handle_api_error(operation: str, error: Exception, status_code: int = 500):
    """
    Handle API errors consistently - log internally, return generic message to user.

    This prevents information exposure by logging full error details internally
    while returning a generic message to the user.

    Args:
        operation: Description of the operation that failed (for logging)
        error: The exception that occurred; it is attached to the log record
        status_code: HTTP status code to return (default: 500)

    Returns:
        Flask JSON response tuple (response, status_code)
    """
    # Attach the passed exception explicitly so it is logged even when this
    # helper is called outside the `except` block that caught it. When no
    # exception object is given, fall back to the one currently being handled.
    logger.opt(exception=error if error is not None else True).error(
        f"Error during {operation}"
    )

    # Return generic message to user (no internal details exposed)
    return jsonify(
        {
            "success": False,
            "error": "An internal error occurred. Please try again or contact support.",
        }
    ), status_code