Coverage for src / local_deep_research / web / app_factory.py: 88%

436 statements  

« prev     ^ index     » next       coverage.py v7.13.5, created at 2026-04-14 23:55 +0000

1# import logging - replaced with loguru 

2import ipaddress 

3import os 

4from pathlib import Path 

5from importlib import resources as importlib_resources 

6 

7from flask import ( 

8 Flask, 

9 Request, 

10 abort, 

11 jsonify, 

12 make_response, 

13 request, 

14 send_from_directory, 

15) 

16from flask_wtf.csrf import CSRFProtect 

17from werkzeug.middleware.proxy_fix import ProxyFix 

18from loguru import logger 

19from local_deep_research.settings.logger import log_settings 

20 

21from ..utilities.log_utils import InterceptHandler 

22from ..security import SecurityHeaders, get_security_default 

23from ..security.rate_limiter import limiter 

24from ..security.file_upload_validator import FileUploadValidator 

25 

26# Removed DB_PATH import - using per-user databases now 

27from .services.socket_service import SocketIOService 

28 

29 

def _is_private_ip(ip_str: str) -> bool:
    """Return True if ``ip_str`` is a private or loopback IP address.

    This allows LAN access over HTTP without requiring HTTPS, matching the
    behavior of other self-hosted applications like Jellyfin and Home Assistant.

    The decision is delegated to ``ipaddress``: an address qualifies when its
    ``is_private`` or ``is_loopback`` property is true. That covers the
    RFC 1918 ranges (10.0.0.0/8, 172.16.0.0/12, 192.168.0.0/16) and localhost,
    and also whatever else ``ipaddress`` classifies as private (for example
    link-local 169.254.0.0/16 and IPv6 unique-local fc00::/7).

    IPv4-mapped IPv6 addresses (``::ffff:a.b.c.d``), which dual-stack
    listeners may report for IPv4 clients, are classified by their embedded
    IPv4 address. Without this step a public client such as
    ``::ffff:8.8.8.8`` can be reported as private on Python versions whose
    ``IPv6Address.is_private`` does not defer to the mapped address.

    Args:
        ip_str: Textual IPv4 or IPv6 address.

    Returns:
        True for private/loopback addresses; False for public addresses and
        for any input ``ipaddress.ip_address`` rejects with ``ValueError``
        (e.g. an empty string).
    """
    try:
        ip = ipaddress.ip_address(ip_str)
    except ValueError:
        return False

    # Only IPv6Address has ``ipv4_mapped``; it is None unless the address
    # lies in ::ffff:0:0/96.
    mapped = getattr(ip, "ipv4_mapped", None)
    if mapped is not None:
        ip = mapped

    return ip.is_private or ip.is_loopback

43 

44 

class DiskSpoolingRequest(Request):
    """Flask ``Request`` subclass with a 5 MiB ``max_form_memory_size``.

    Intended as the application's ``request_class`` so that large multipart
    uploads are not held entirely in memory (security fix for issue #1176:
    with 200 files x 50MB each, a single request could otherwise consume
    10GB+ of memory).

    NOTE(review): the disk-spooling effect is attributed here to
    ``max_form_memory_size``; confirm against the Werkzeug documentation for
    that attribute that it governs file parts and not only non-file form
    fields.
    """

    # 5 MiB threshold, in bytes.
    max_form_memory_size = 5 * 1024**2

58 

59 

def create_app():
    """
    Create and configure the Flask application.

    In order, this function:

    - routes the ``werkzeug`` and ``apscheduler`` stdlib loggers through loguru,
    - builds the Flask app with package (or fallback) template/static paths,
    - wraps the WSGI app with ProxyFix, SecureCookieMiddleware and
      ServerHeaderMiddleware (outermost),
    - loads or generates the per-installation SECRET_KEY,
    - configures session cookies, upload limits, CSRF, security headers and
      rate limiting,
    - logs data-storage information, initializes the Vite/theme helpers and
      writes the combined ``themes.css``,
    - creates the SocketIO service and (optionally) starts the news scheduler,
    - applies middleware, registers blueprints and error handlers,
    - starts the research queue processor.

    Returns:
        tuple: (app, socket_service) - The configured Flask app and the
        SocketIOService instance bound to it
    """
    # Route stdlib loggers through loguru via InterceptHandler.
    # Guard against handler duplication when create_app() is called multiple
    # times (e.g. in tests).
    import logging

    werkzeug_logger = logging.getLogger("werkzeug")
    werkzeug_logger.setLevel(
        logging.WARNING
    )  # Suppress verbose per-request logs
    if not any(
        isinstance(h, InterceptHandler) for h in werkzeug_logger.handlers
    ):
        werkzeug_logger.addHandler(InterceptHandler())

    # APScheduler logs job execution results (success/failure) to its own
    # logger hierarchy. Without an InterceptHandler the WARNING+ messages
    # only reach Python's lastResort handler as unformatted stderr.
    # Level is WARNING (not INFO) because job functions already log their
    # own progress via loguru — APScheduler's INFO messages would be redundant.
    apscheduler_logger = logging.getLogger("apscheduler")
    apscheduler_logger.setLevel(logging.WARNING)
    if not any(
        isinstance(h, InterceptHandler) for h in apscheduler_logger.handlers
    ):
        apscheduler_logger.addHandler(InterceptHandler())

    logger.info("Initializing Local Deep Research application...")

    try:
        # Get directories based on package installation
        PACKAGE_DIR = importlib_resources.files("local_deep_research") / "web"
        with importlib_resources.as_file(PACKAGE_DIR) as package_dir:
            STATIC_DIR = (package_dir / "static").as_posix()
            TEMPLATE_DIR = (package_dir / "templates").as_posix()

        # Initialize Flask app with package directories
        # Set static_folder to None to disable Flask's built-in static handling
        # We'll use our custom static route instead to handle dist folder
        app = Flask(__name__, static_folder=None, template_folder=TEMPLATE_DIR)
        # Store static dir for custom handling
        app.config["STATIC_DIR"] = STATIC_DIR
        logger.debug(f"Using package static path: {STATIC_DIR}")
        logger.debug(f"Using package template path: {TEMPLATE_DIR}")
    except Exception:
        # Fallback for development
        logger.exception("Package directories not found, using fallback paths")
        # Set static_folder to None to disable Flask's built-in static handling
        app = Flask(
            __name__,
            static_folder=None,
            template_folder=str(Path("templates").resolve()),
        )
        # Store static dir for custom handling
        app.config["STATIC_DIR"] = str(Path("static").resolve())

    # Use custom Request class that spools large uploads to disk
    # This prevents memory exhaustion from large file uploads (issue #1176)
    app.request_class = DiskSpoolingRequest

    # Add proxy support for deployments behind load balancers/reverse proxies
    # This ensures X-Forwarded-For and X-Forwarded-Proto headers are properly handled
    # Important for rate limiting and security (gets real client IP, not proxy IP)
    app.wsgi_app = ProxyFix(  # type: ignore[method-assign]
        app.wsgi_app,
        x_for=1,  # Trust 1 proxy for X-Forwarded-For
        x_proto=1,  # Trust 1 proxy for X-Forwarded-Proto (http/https)
        x_host=0,  # Don't trust X-Forwarded-Host (security)
        x_port=0,  # Don't trust X-Forwarded-Port (security)
        x_prefix=0,  # Don't trust X-Forwarded-Prefix (security)
    )

    # WSGI middleware for dynamic cookie security
    # This wraps ProxyFix (it is the outer layer), so it sees the environ
    # before ProxyFix rewrites it - i.e. the real connection REMOTE_ADDR.
    # Must be WSGI level because Flask session cookies are set after after_request handlers
    class SecureCookieMiddleware:
        """WSGI middleware to add Secure flag to cookies based on request context.

        Security model (implemented by ``_should_add_secure_flag``):
        - TESTING mode (``LDR_TESTING_MODE``): Never add Secure flag (for CI/development)
        - HTTPS request (``wsgi.url_scheme == "https"``): Add Secure flag
        - HTTP from a private/loopback connection address: Skip Secure flag
          (localhost and LAN access work over plain HTTP)
        - HTTP from a public connection address: Add Secure flag
          (the cookie will not be sent back, by design - use HTTPS)

        The decision uses the connection's REMOTE_ADDR and never the value of
        X-Forwarded-For, so a spoofed header cannot downgrade cookie security.
        """

        def __init__(self, wsgi_app, flask_app):
            self.wsgi_app = wsgi_app
            self.flask_app = flask_app

        def __call__(self, environ, start_response):
            # Decide before calling the wrapped app: the inner ProxyFix has
            # not touched the environ yet at this point.
            should_add_secure = self._should_add_secure_flag(environ)

            def custom_start_response(status, headers, exc_info=None):
                if should_add_secure:
                    # Modify Set-Cookie headers to add Secure flag
                    new_headers = []
                    for name, value in headers:
                        if name.lower() == "set-cookie":
                            # Attribute names are case-insensitive and the
                            # space after ";" is optional, so compare parsed
                            # attributes instead of matching a fixed spelling.
                            attributes = [
                                part.strip().lower()
                                for part in value.split(";")[1:]
                            ]
                            if "secure" not in attributes:
                                value = value + "; Secure"
                        new_headers.append((name, value))
                    headers = new_headers
                return start_response(status, headers, exc_info)

            return self.wsgi_app(environ, custom_start_response)

        def _should_add_secure_flag(self, environ):
            """Determine if Secure flag should be added based on request context.

            Security model:
            - Check the ACTUAL connection IP (REMOTE_ADDR), not X-Forwarded-For header
            - SecureCookieMiddleware is outer wrapper, so we see original REMOTE_ADDR
            - If connection comes from private IP (client or proxy), allow HTTP
            - If connection comes from public IP, require HTTPS

            This is safe because:
            - We never trust X-Forwarded-For header values (can be spoofed)
            - We only check the actual TCP connection source IP
            - Spoofing X-Forwarded-For from public IP doesn't bypass this check
            - Local proxies (nginx on localhost/LAN) have private REMOTE_ADDR

            Returns:
                bool: True if "; Secure" should be appended to Set-Cookie headers.
            """
            # Skip if in explicit testing mode
            if self.flask_app.config.get("LDR_TESTING_MODE"):
                return False

            # Check actual connection source IP (before ProxyFix modifies it)
            # This is either:
            # - Direct client IP (if no proxy)
            # - Proxy server IP (if behind proxy)
            # Local proxies (nginx on localhost, Traefik on LAN) have private IPs
            remote_addr = environ.get("REMOTE_ADDR", "")
            is_private = _is_private_ip(remote_addr)

            # Check if HTTPS.
            # NOTE(review): this reads wsgi.url_scheme before the wrapped
            # ProxyFix applies X-Forwarded-Proto, so a TLS-terminating proxy
            # on a private address yields is_https == False here. Confirm
            # that omitting Secure in that deployment is intended.
            is_https = environ.get("wsgi.url_scheme") == "https"

            # Add Secure flag if:
            # - Using HTTPS (always secure over HTTPS)
            # - OR connection is from public IP (require HTTPS for public access)
            return is_https or not is_private

    # Wrap the app with our cookie security middleware
    app.wsgi_app = SecureCookieMiddleware(app.wsgi_app, app)  # type: ignore[method-assign]

    # WSGI middleware to remove Server header
    # This must be the outermost wrapper to catch headers added by Werkzeug
    class ServerHeaderMiddleware:
        """WSGI middleware to remove Server header from all responses.

        Prevents information disclosure about the underlying web server.
        Must be outermost middleware to catch headers added by WSGI layer.
        """

        def __init__(self, wsgi_app):
            self.wsgi_app = wsgi_app

        def __call__(self, environ, start_response):
            def custom_start_response(status, headers, exc_info=None):
                filtered_headers = [
                    (name, value)
                    for name, value in headers
                    if name.lower() != "server"
                ]
                return start_response(status, filtered_headers, exc_info)

            return self.wsgi_app(environ, custom_start_response)

    # Apply ServerHeaderMiddleware as outermost wrapper
    app.wsgi_app = ServerHeaderMiddleware(app.wsgi_app)  # type: ignore[method-assign]

    # App configuration
    # Generate or load a unique SECRET_KEY per installation
    import secrets
    from ..config.paths import get_data_directory

    secret_key_file = Path(get_data_directory()) / ".secret_key"
    secret_key_file.parent.mkdir(parents=True, exist_ok=True)
    new_key = secrets.token_hex(32)
    try:
        # O_EXCL makes creation atomic: exactly one process wins the race to
        # create the key file; everyone else lands in FileExistsError below.
        fd = os.open(
            str(secret_key_file), os.O_WRONLY | os.O_CREAT | os.O_EXCL, 0o600
        )
        try:
            os.write(fd, new_key.encode())
        finally:
            os.close(fd)
        app.config["SECRET_KEY"] = new_key
        logger.info("Generated new SECRET_KEY for this installation")
    except FileExistsError:
        try:
            with open(secret_key_file, "r") as f:
                stored_key = f.read().strip()
        except Exception:
            logger.warning("Could not read secret key file")
            stored_key = ""
        else:
            if not stored_key:
                # An empty file (e.g. an interrupted first write) must not
                # become an empty SECRET_KEY; fall back to the fresh key.
                logger.warning(
                    "Secret key file is empty; using a temporary SECRET_KEY "
                    "for this process"
                )
        app.config["SECRET_KEY"] = stored_key or new_key
    except OSError:
        logger.warning("Could not save secret key file")
        app.config["SECRET_KEY"] = new_key
    # Session cookie security settings
    # SECURE flag is added dynamically based on request context (see
    # SecureCookieMiddleware above)
    # This allows localhost HTTP to work for development while keeping production secure
    #
    # Check if explicitly in testing mode (for backwards compatibility)
    is_testing = (
        os.getenv("CI")
        or os.getenv("TESTING")
        or os.getenv("PYTEST_CURRENT_TEST")
        or app.debug
    )
    # Set to False - we add Secure flag dynamically in SecureCookieMiddleware
    # Exception: if TESTING mode is active, we never add Secure flag
    app.config["SESSION_COOKIE_SECURE"] = False
    app.config["LDR_TESTING_MODE"] = bool(
        is_testing
    )  # Read by SecureCookieMiddleware
    app.config["SESSION_COOKIE_HTTPONLY"] = (
        True  # Prevent JavaScript access (XSS mitigation)
    )
    app.config["SESSION_COOKIE_SAMESITE"] = "Lax"  # CSRF protection
    # Set max cookie lifetime for permanent sessions (when session.permanent=True).
    # This applies to "remember me" sessions; non-permanent sessions expire on browser close.
    remember_me_days = get_security_default(
        "security.session_remember_me_days", 30
    )
    app.config["PERMANENT_SESSION_LIFETIME"] = remember_me_days * 24 * 3600
    # PREFERRED_URL_SCHEME affects URL generation (url_for), not request.is_secure
    app.config["PREFERRED_URL_SCHEME"] = "https"

    # File upload security limits - calculated from FileUploadValidator constants
    app.config["MAX_CONTENT_LENGTH"] = (
        FileUploadValidator.MAX_FILES_PER_REQUEST
        * FileUploadValidator.MAX_FILE_SIZE
    )

    # Initialize CSRF protection
    # Explicitly enable CSRF protection (don't rely on implicit Flask-WTF behavior)
    app.config["WTF_CSRF_ENABLED"] = True
    CSRFProtect(app)
    # Exempt Socket.IO from CSRF protection
    # Note: Flask-SocketIO handles CSRF internally, so we don't need to exempt specific views

    # Initialize security headers middleware
    SecurityHeaders(app)

    # Initialize rate limiting for security (brute force protection)
    # Uses imported limiter from security.rate_limiter module
    # Rate limiting is disabled in CI via enabled callable in rate_limiter.py
    # Also set app config to ensure Flask-Limiter respects our settings
    from ..settings.env_registry import is_rate_limiting_enabled

    app.config["RATELIMIT_ENABLED"] = is_rate_limiting_enabled()
    app.config["RATELIMIT_STRATEGY"] = "moving-window"
    limiter.init_app(app)

    # Custom error handler for rate limit exceeded (429)
    @app.errorhandler(429)
    def ratelimit_handler(e):
        # Import here to avoid circular imports
        from ..security.rate_limiter import get_client_ip

        # Audit logging for security monitoring
        # Use get_client_ip() to get the real IP behind proxies
        logger.warning(
            f"Rate limit exceeded: endpoint={request.endpoint} "
            f"ip={get_client_ip()} "
            f"user_agent={request.headers.get('User-Agent', 'unknown')}"
        )
        return jsonify(
            error="Too many requests",
            message="Too many attempts. Please try again later.",
        ), 429

    # Note: Dynamic cookie security is handled by SecureCookieMiddleware (WSGI level)
    # This is necessary because Flask's session cookies are set AFTER after_request handlers
    # The middleware wrapping happens above, right after ProxyFix

    # Note: CSRF exemptions for API blueprints are applied after blueprint
    # registration (search for "CSRF exemptions" in this file).

    # Database configuration - Using per-user databases now
    # No shared database configuration needed
    app.config["SQLALCHEMY_TRACK_MODIFICATIONS"] = False
    app.config["SQLALCHEMY_ECHO"] = False

    # Per-user databases are created automatically via encrypted_db.py

    # Log data location and security information
    # (get_data_directory is already imported above for the SECRET_KEY file)
    from ..database.encrypted_db import db_manager

    data_dir = get_data_directory()
    logger.info("=" * 60)
    logger.info("DATA STORAGE INFORMATION")
    logger.info("=" * 60)
    logger.info(f"Data directory: {data_dir}")
    logger.info(
        "Databases: Per-user encrypted databases in encrypted_databases/"
    )

    # Check if using custom location
    from local_deep_research.settings.manager import SettingsManager

    settings_manager = SettingsManager()
    custom_data_dir = settings_manager.get_setting("bootstrap.data_dir")
    if custom_data_dir:
        logger.info(
            f"Using custom data location via LDR_DATA_DIR: {custom_data_dir}"
        )
    else:
        logger.info("Using default platform-specific data location")

    # Display security status based on actual SQLCipher availability
    if db_manager.has_encryption:
        logger.info(
            "SECURITY: Databases are encrypted with SQLCipher. Ensure appropriate file system permissions are set on the data directory."
        )
    else:
        logger.warning(
            "SECURITY NOTICE: SQLCipher is not available - databases are NOT encrypted. "
            "Install SQLCipher for database encryption. Ensure appropriate file system permissions are set on the data directory."
        )

    logger.info(
        "TIP: You can change the data location by setting the LDR_DATA_DIR environment variable."
    )
    logger.info("=" * 60)

    # Initialize Vite helper for asset management
    from .utils.vite_helper import vite

    vite.init_app(app)

    # Initialize Theme helper for auto-detecting themes from CSS
    from .utils.theme_helper import theme_helper

    theme_helper.init_app(app)

    # Generate combined themes.css from individual theme files
    from .themes import theme_registry

    try:
        static_dir = Path(app.config.get("STATIC_DIR", "static"))
        themes_css_path = static_dir / "css" / "themes.css"
        combined_css = theme_registry.get_combined_css()
        themes_css_path.write_text(combined_css, encoding="utf-8")
        logger.debug(
            f"Generated themes.css with {len(theme_registry.themes)} themes"
        )
    except PermissionError:
        logger.warning(
            f"Cannot write themes.css to {themes_css_path}. "
            "Theme CSS will need to be pre-generated."
        )
    except Exception:
        logger.exception("Error generating combined themes.css")

    # Register socket service
    socket_service = SocketIOService(app=app)

    # Initialize news subscription scheduler
    try:
        # News tables are now created per-user in their encrypted databases
        logger.info(
            "News tables will be created in per-user encrypted databases"
        )

        # Check if scheduler is enabled BEFORE importing/initializing
        # Use env registry which handles both env vars and settings
        from ..settings.env_registry import get_env_setting

        scheduler_enabled = get_env_setting("news.scheduler.enabled", True)
        logger.info(f"News scheduler enabled: {scheduler_enabled}")

        if scheduler_enabled:
            # Only import and initialize if enabled
            from ..news.subscription_manager.scheduler import (
                get_news_scheduler,
            )

            # Get scheduler instance and initialize with the system settings
            # manager created above for the data-location check.
            scheduler = get_news_scheduler()
            scheduler.initialize_with_settings(settings_manager)
            scheduler.set_app(app)
            scheduler.start()
            app.news_scheduler = scheduler  # type: ignore[attr-defined]
            logger.info("News scheduler started with activity-based tracking")
        else:
            # Don't initialize scheduler if disabled
            app.news_scheduler = None  # type: ignore[attr-defined]
            logger.info("News scheduler disabled - not initializing")
    except Exception:
        # Scheduler failure is non-fatal: the app still starts without it.
        logger.exception("Failed to initialize news scheduler")
        app.news_scheduler = None  # type: ignore[attr-defined]

    # Apply middleware
    logger.info("Applying middleware...")
    apply_middleware(app)
    logger.info("Middleware applied successfully")

    # Register blueprints
    logger.info("Registering blueprints...")
    register_blueprints(app)
    logger.info("Blueprints registered successfully")

    # Register error handlers
    logger.info("Registering error handlers...")
    register_error_handlers(app)
    logger.info("Error handlers registered successfully")

    # Start the queue processor v2 (uses encrypted databases)
    # Always start the processor - it will handle per-user queue modes
    logger.info("Starting queue processor v2...")
    from .queue.processor_v2 import queue_processor

    queue_processor.start()
    logger.info("Started research queue processor v2")

    logger.info("App factory completed successfully")

    return app, socket_service

496 

497 

def apply_middleware(app):
    """Apply middleware to the Flask app.

    Registers, in this order, the ``before_request`` hooks for stale-session
    cleanup, per-user database opening, current-user injection,
    completed-research cleanup, pending queue operations, queue-processor
    notification and log-queue flushing. It then registers a Jinja context
    processor exposing ``ResearchStatus`` values, an app-context teardown
    that releases database sessions, and a ``before_request`` guard for
    ``/socket.io`` paths.

    Args:
        app: The Flask application to attach the hooks to.
    """

    # Import auth decorators and middleware
    logger.info("Importing cleanup_middleware...")
    from .auth.cleanup_middleware import cleanup_completed_research

    logger.info("Importing database_middleware...")
    from .auth.database_middleware import ensure_user_database

    logger.info("Importing decorators...")
    from .auth.decorators import inject_current_user

    logger.info("Importing queue_middleware...")
    from .auth.queue_middleware import process_pending_queue_operations

    logger.info("Importing queue_middleware_v2...")
    from .auth.queue_middleware_v2 import notify_queue_processor

    logger.info("Importing session_cleanup...")
    from .auth.session_cleanup import cleanup_stale_sessions

    logger.info("All middleware imports completed")

    # Register authentication middleware
    # First clean up stale sessions
    app.before_request(cleanup_stale_sessions)
    # Then ensure database is open for authenticated users
    app.before_request(ensure_user_database)
    # Then inject current user into g
    app.before_request(inject_current_user)
    # Clean up completed research records
    app.before_request(cleanup_completed_research)
    # Process any pending queue operations for this user (direct mode)
    app.before_request(process_pending_queue_operations)
    # Notify queue processor of user activity (queue mode)
    app.before_request(notify_queue_processor)

    logger.info("All middleware registered")

    # Flush any queued logs from background threads
    logger.info("Importing log_utils...")
    from ..utilities.log_utils import flush_log_queue

    app.before_request(flush_log_queue)
    logger.info("Log flushing middleware registered")

    # Inject backend constants into Jinja2 templates for frontend JS.
    # This is the Flask-documented pattern for sharing Python enums with JavaScript.
    # Source of truth: src/local_deep_research/constants.py::ResearchStatus
    # Frontend helpers: src/local_deep_research/web/static/js/config/constants.js
    # Template injection: src/local_deep_research/web/templates/base.html
    from ..constants import ResearchStatus

    @app.context_processor
    def inject_frontend_constants():
        """Expose the ResearchStatus members and terminal states to templates."""
        terminal = [
            ResearchStatus.COMPLETED,
            ResearchStatus.SUSPENDED,
            ResearchStatus.FAILED,
            ResearchStatus.ERROR,
            ResearchStatus.CANCELLED,
        ]
        return {
            "research_status_enum": {m.name: m.value for m in ResearchStatus},
            "research_terminal_states": [str(s) for s in terminal],
        }

    # Clean up database sessions after each request
    @app.teardown_appcontext
    def cleanup_db_session(exception=None):
        """Clean up database session after each request to avoid cross-thread issues."""
        from flask import g

        session = g.pop("db_session", None)
        if session is not None:
            # Rollback and close are attempted independently so that a
            # failed rollback never prevents the close.
            try:
                session.rollback()
            except Exception:
                logger.warning(
                    "Error rolling back request session during cleanup"
                )
            try:
                session.close()
            except Exception:
                logger.warning("Error closing request session during cleanup")

        # Sweep credential entries for dead threads. Multiple trigger
        # points (here, processor_v2, and connection_cleanup scheduler)
        # ensure sweeps happen regardless of traffic patterns.
        try:
            from ..database.thread_local_session import cleanup_dead_threads

            cleanup_dead_threads()
        except Exception:
            # loguru does not interpret the stdlib ``exc_info=True`` keyword;
            # ``opt(exception=True)`` is what attaches the traceback.
            logger.opt(exception=True).debug("Error during dead thread sweep")

        # Clean up any thread-local database session that may have been created
        # via get_metrics_session() fallback in session_context.py (e.g. background
        # threads or error paths where g.db_session was unavailable).
        try:
            from ..database.thread_local_session import cleanup_current_thread

            cleanup_current_thread()
        except Exception:
            logger.opt(exception=True).debug(
                "Error during thread-local session cleanup"
            )

    # Add a middleware layer to handle abrupt disconnections
    @app.before_request
    def handle_websocket_requests():
        """Guard ``/socket.io`` requests against preprocessing errors.

        Returns None (continue normal handling) in every non-error case; if
        inspecting the request environ raises, logs the exception and
        short-circuits with an empty 200 response.
        """
        if request.path.startswith("/socket.io"):
            try:
                if not request.environ.get("werkzeug.socket"):
                    return None
            except Exception:
                logger.exception("WebSocket preprocessing error")
                # Return empty response to prevent further processing
                return "", 200
        return None

    # Note: CORS headers for API routes are now handled by SecurityHeaders middleware
    # (see src/local_deep_research/security/security_headers.py)

622 

623 

624def register_blueprints(app): 

625 """Register blueprints with the Flask app.""" 

626 

627 # Import blueprints 

628 logger.info("Importing blueprints...") 

629 

630 # Import benchmark blueprint 

631 from ..benchmarks.web_api.benchmark_routes import benchmark_bp 

632 

633 logger.info("Importing API blueprint...") 

634 from .api import api_blueprint # Import the API blueprint 

635 

636 logger.info("Importing auth blueprint...") 

637 from .auth import auth_bp # Import the auth blueprint 

638 

639 logger.info("Importing API routes blueprint...") 

640 from .routes.api_routes import api_bp # Import the API blueprint 

641 

642 logger.info("Importing context overflow API...") 

643 from .routes.context_overflow_api import ( 

644 context_overflow_bp, 

645 ) # Import context overflow API 

646 

647 logger.info("Importing history routes...") 

648 from .routes.history_routes import history_bp 

649 

650 logger.info("Importing metrics routes...") 

651 from .routes.metrics_routes import metrics_bp 

652 

653 logger.info("Importing research routes...") 

654 from .routes.research_routes import research_bp 

655 

656 logger.info("Importing settings routes...") 

657 from .routes.settings_routes import settings_bp 

658 

659 logger.info("All core blueprints imported successfully") 

660 

661 # Add root route 

662 @app.route("/") 

663 def index(): 

664 """Root route - redirect to login if not authenticated""" 

665 from flask import redirect, session, url_for 

666 

667 from ..constants import get_available_strategies 

668 from ..database.session_context import get_user_db_session 

669 from ..utilities.db_utils import get_settings_manager 

670 from .utils.templates import render_template_with_defaults 

671 

672 # Check if user is authenticated 

673 if "username" not in session: 

674 return redirect(url_for("auth.login")) 

675 

676 # Load current settings from database using proper session context 

677 username = session.get("username") 

678 settings = {} 

679 show_all = False 

680 with get_user_db_session(username) as db_session: 

681 if db_session: 681 ↛ 718line 681 didn't jump to line 718

682 settings_manager = get_settings_manager(db_session, username) 

683 settings = { 

684 "llm_provider": settings_manager.get_setting( 

685 "llm.provider", "ollama" 

686 ), 

687 "llm_model": settings_manager.get_setting("llm.model", ""), 

688 "llm_openai_endpoint_url": settings_manager.get_setting( 

689 "llm.openai_endpoint.url", "" 

690 ), 

691 "llm_ollama_url": settings_manager.get_setting( 

692 "llm.ollama.url" 

693 ), 

694 "llm_lmstudio_url": settings_manager.get_setting( 

695 "llm.lmstudio.url" 

696 ), 

697 "llm_local_context_window_size": settings_manager.get_setting( 

698 "llm.local_context_window_size" 

699 ), 

700 "search_tool": settings_manager.get_setting( 

701 "search.tool", "" 

702 ), 

703 "search_iterations": settings_manager.get_setting( 

704 "search.iterations", 3 

705 ), 

706 "search_questions_per_iteration": settings_manager.get_setting( 

707 "search.questions_per_iteration", 2 

708 ), 

709 "search_strategy": settings_manager.get_setting( 

710 "search.search_strategy", "source-based" 

711 ), 

712 } 

713 show_all = settings_manager.get_setting( 

714 "search.show_all_strategies", False 

715 ) 

716 

717 # Debug logging 

718 log_settings(settings, "Research page settings loaded") 

719 

720 return render_template_with_defaults( 

721 "pages/research.html", 

722 settings=settings, 

723 strategies=get_available_strategies(show_all=bool(show_all)), 

724 ) 

725 

726 # Register auth blueprint FIRST (so login page is accessible) 

727 app.register_blueprint(auth_bp) # Already has url_prefix="/auth" 

728 

729 # Register other blueprints 

730 app.register_blueprint(research_bp) 

731 app.register_blueprint(history_bp) # Already has url_prefix="/history" 

732 app.register_blueprint(metrics_bp) 

733 app.register_blueprint(settings_bp) # Already has url_prefix="/settings" 

734 app.register_blueprint( 

735 api_bp, url_prefix="/research/api" 

736 ) # Register API blueprint with prefix 

737 app.register_blueprint(benchmark_bp) # Register benchmark blueprint 

738 app.register_blueprint( 

739 context_overflow_bp, url_prefix="/metrics" 

740 ) # Register context overflow API 

741 

742 # Register news API routes 

743 from .routes import news_routes 

744 

745 app.register_blueprint(news_routes.bp) 

746 logger.info("News API routes registered successfully") 

747 

748 # Register follow-up research routes 

749 from ..followup_research.routes import followup_bp 

750 

751 app.register_blueprint(followup_bp) 

752 logger.info("Follow-up research routes registered successfully") 

753 

754 # Register news page blueprint 

755 from ..news.web import create_news_blueprint 

756 

757 news_bp = create_news_blueprint() 

758 app.register_blueprint(news_bp, url_prefix="/news") 

759 logger.info("News page routes registered successfully") 

760 

761 # Register API v1 blueprint 

762 app.register_blueprint(api_blueprint) # Already has url_prefix='/api/v1' 

763 

764 # Register Research Library blueprint 

765 from ..research_library import library_bp, rag_bp, delete_bp 

766 

767 app.register_blueprint(library_bp) # Already has url_prefix='/library' 

768 logger.info("Research Library routes registered successfully") 

769 

770 # Register RAG Management blueprint 

771 app.register_blueprint(rag_bp) # Already has url_prefix='/library' 

772 logger.info("RAG Management routes registered successfully") 

773 

774 # Register Deletion Management blueprint 

775 app.register_blueprint(delete_bp) # Already has url_prefix='/library/api' 

776 logger.info("Deletion Management routes registered successfully") 

777 

778 # Register Semantic Search blueprint 

779 from ..research_library.search import search_bp 

780 

781 app.register_blueprint(search_bp) # url_prefix='/library' 

782 logger.info("Semantic Search routes registered successfully") 

783 

784 # Register Document Scheduler blueprint 

785 from ..research_scheduler.routes import scheduler_bp 

786 

787 app.register_blueprint(scheduler_bp) 

788 logger.info("Document Scheduler routes registered successfully") 

789 

790 # CSRF exemptions — Flask-WTF requires Blueprint objects (not strings) 

791 # to populate _exempt_blueprints. Passing strings only populates 

792 # _exempt_views, which compares against module-qualified names and 

793 # silently fails to match Flask endpoint names. 

794 if hasattr(app, "extensions") and "csrf" in app.extensions: 

795 csrf = app.extensions["csrf"] 

796 # Only api_v1 is exempt: it's a programmatic REST API used by 

797 # external clients. The api, benchmark, and research blueprints 

798 # are browser-facing and the frontend already sends CSRF tokens. 

799 for bp_name in ("api_v1",): 

800 bp_obj = app.blueprints.get(bp_name) 

801 if bp_obj is not None: 801 ↛ 799line 801 didn't jump to line 799 because the condition on line 801 was always true

802 csrf.exempt(bp_obj) 

803 

804 # Add favicon route 

805 # Exempt favicon from rate limiting 

@app.route("/favicon.ico")
@limiter.exempt
def favicon():
    """Return ``favicon.ico`` from the configured static directory.

    The directory comes from the ``STATIC_DIR`` app config entry and
    defaults to ``"static"`` when that entry is absent.
    """
    return send_from_directory(
        app.config.get("STATIC_DIR", "static"),
        "favicon.ico",
        mimetype="image/x-icon",
    )

813 

814 # Add static route at the app level for compatibility 

815 # Exempt static files from rate limiting 

816 import re 

817 

818 _HASHED_FILENAME_RE = re.compile(r"\.[A-Za-z0-9_-]{8,}\.") 

819 

@app.route("/static/<path:path>")
@limiter.exempt
def app_serve_static(path):
    """Serve a static asset, preferring built (dist) files over raw ones.

    Candidates are tried in order; the first one that passes path
    validation and is a regular file is returned:

    1. ``<STATIC_DIR>/dist/<path without its leading "dist/">`` -- only
       when the URL path starts with ``dist/``. Flask captures the path as
       e.g. ``dist/js/app.abc.js``, so the prefix is stripped to avoid a
       doubled ``static/dist/dist/...`` lookup.
    2. ``<STATIC_DIR>/dist/<path>`` -- Vite uses ``base: '/static/'``, so
       CSS references ``/static/fonts/...`` while the files live in
       ``static/dist/fonts/...``.
    3. ``<STATIC_DIR>/<path>`` -- the regular static folder.

    Args:
        path: The portion of the URL captured after ``/static/``.

    Returns:
        The file response with a ``Cache-Control`` header. Files served
        from ``dist`` whose name matches ``_HASHED_FILENAME_RE`` get a
        one-year immutable policy; everything else must revalidate.

    Aborts with 404 when no candidate can be served.
    """
    from ..security.path_validator import PathValidator

    immutable_policy = "public, max-age=31536000, immutable"
    revalidate_policy = "public, max-age=0, must-revalidate"

    static_dir = Path(app.config.get("STATIC_DIR", "static"))
    dist_dir = static_dir / "dist"
    dist_prefix = "dist/"

    # Each entry: (base directory, path relative to it, whether a
    # content-hashed name may be cached as immutable).
    candidates = []
    if path.startswith(dist_prefix):
        candidates.append((dist_dir, path[len(dist_prefix) :], True))
    candidates.append((dist_dir, path, True))
    # Files in the plain static folder are never treated as hashed.
    candidates.append((static_dir, path, False))

    for base_dir, relative, allow_immutable in candidates:
        # The try spans the send as well as the validation so that any
        # ValueError still just moves on to the next candidate, as before.
        try:
            validated_path = PathValidator.validate_safe_path(
                relative,
                base_dir,
                allow_absolute=False,
                required_extensions=None,
            )

            # is_file() rather than exists(): send_from_directory answers
            # 404 for a directory, which would otherwise end the lookup
            # before the remaining fallbacks are tried.
            if not validated_path or not validated_path.is_file():
                continue

            response = make_response(
                send_from_directory(str(base_dir), relative)
            )
            if allow_immutable and _HASHED_FILENAME_RE.search(relative):
                # Content-hashed files are safe for immutable caching
                response.headers["Cache-Control"] = immutable_policy
            else:
                # Non-hashed files must revalidate on each request
                response.headers["Cache-Control"] = revalidate_policy
            return response
        except ValueError:
            # Path validation failed for this candidate
            continue

    abort(404)
    # abort() raises; the explicit return keeps the function's return
    # paths consistent.
    return None

905 

906 

def register_error_handlers(app):
    """Register error handlers with the Flask app.

    Handlers installed:

    * 404 / 500 / 413 -- JSON ``{"error": ...}`` for paths under ``/api/``,
      a plain-text body otherwise.
    * 401 -- JSON for ``/api/`` and ``/settings/api/`` paths, otherwise a
      redirect to the login page.
    * ``WebAPIException`` -- logged, then returned as JSON using the
      exception's own ``to_dict()`` and ``status_code``.
    * ``CSRFError`` (when Flask-WTF is importable) -- JSON 400 with an
      extended help message for HTTP access from a public IP or via a proxy.
    * ``NewsAPIException`` (when the news module is importable) -- logged,
      then returned as JSON.

    Args:
        app: The Flask application to attach the handlers to.
    """

    def _error_response(message, status):
        """Return JSON for ``/api/`` requests, plain text for the rest."""
        if request.path.startswith("/api/"):
            return make_response(jsonify({"error": message}), status)
        return make_response(message, status)

    @app.errorhandler(404)
    def not_found(error):
        return _error_response("Not found", 404)

    @app.errorhandler(500)
    def server_error(error):
        return _error_response("Server error", 500)

    @app.errorhandler(401)
    def handle_unauthorized(error):
        # API-style paths get a JSON body; browser pages are redirected.
        if request.path.startswith(("/api/", "/settings/api/")):
            return make_response(
                jsonify({"error": "Authentication required"}),
                401,
            )
        from .auth.decorators import _safe_redirect_to_login

        return _safe_redirect_to_login()

    @app.errorhandler(413)
    def handle_request_too_large(error):
        return _error_response("Request too large", 413)

    from .exceptions import WebAPIException

    @app.errorhandler(WebAPIException)
    def handle_web_api_exception(error):
        """Handle WebAPIException and return JSON."""
        logger.error(
            "Web API error: {} (status {})", error.error_code, error.status_code
        )
        return jsonify(error.to_dict()), error.status_code

    # Handle CSRF validation errors with helpful message
    try:
        from flask_wtf.csrf import CSRFError

        @app.errorhandler(CSRFError)
        def handle_csrf_error(error):
            """Handle CSRF errors with helpful debugging info."""
            # Check if this might be a Secure cookie issue over HTTP
            is_http = not request.is_secure
            is_private = _is_private_ip(request.remote_addr or "")
            is_proxied = request.headers.get("X-Forwarded-For") is not None

            error_msg = str(error.description)

            # Provide detailed help for HTTP + public IP or proxied scenario
            if is_http and (not is_private or is_proxied):
                # Values are passed as arguments, matching the "{}" style
                # of the other logger calls in this function.
                logger.warning(
                    "CSRF validation failed - likely due to Secure cookie over HTTP. "
                    "remote_addr={}, proxied={}, host={}",
                    request.remote_addr,
                    is_proxied,
                    request.host,
                )
                error_msg = (
                    "Session cookie error: You're accessing over HTTP from a "
                    "public IP address or through a proxy. "
                    "This is blocked for security reasons.\n\n"
                    "Solutions:\n"
                    "1. Use HTTPS with a reverse proxy (recommended for production)\n"
                    "2. Access from your local network (LAN IPs like 192.168.x.x work over HTTP)\n"
                    "3. Access directly from localhost (http://127.0.0.1:5000)\n"
                    "4. Use SSH tunnel: ssh -L 5000:localhost:5000 user@server, "
                    "then access http://localhost:5000\n\n"
                    "Note: LAN access (192.168.x.x, 10.x.x.x, 172.16-31.x.x) works over HTTP. "
                    "Only public internet access requires HTTPS."
                )

            return make_response(jsonify({"error": error_msg}), 400)
    except ImportError:
        pass

    # Handle News API exceptions globally
    try:
        from ..news.exceptions import NewsAPIException

        @app.errorhandler(NewsAPIException)
        def handle_news_api_exception(error):
            """Handle NewsAPIException and convert to JSON response."""
            # Uses the module-level loguru logger, like the WebAPIException
            # handler above.
            logger.error(
                "News API error: {} (status {})",
                error.error_code,
                error.status_code,
            )
            return jsonify(error.to_dict()), error.status_code
    except ImportError:
        # News module not available
        pass

1011 

1012 

def create_database(app):
    """Do nothing; retained only so existing callers keep working.

    DEPRECATED: Database creation is now handled per-user via
    encrypted_db.py. The ``app`` argument is ignored and ``None`` is
    returned.
    """
    return None