Coverage for src/local_deep_research/scheduler/background.py: 92%

1"""

2Activity-based news subscription scheduler for per-user encrypted databases.

3Tracks user activity and temporarily stores credentials for automatic updates.

4"""

6import random

7import threading

8from dataclasses import dataclass

9from datetime import datetime, timedelta, UTC

10from functools import wraps

11from typing import Any, Callable, Dict, List

13from cachetools import TTLCache

14from loguru import logger

15from ..settings.logger import log_settings

16from ..settings.manager import SnapshotSettingsContext

18from apscheduler.schedulers.background import BackgroundScheduler

19from apscheduler.jobstores.base import JobLookupError

20from sqlalchemy import func

21from ..constants import ResearchStatus

22from ..database.credential_store_base import CredentialStoreBase

23from ..database.session_context import safe_rollback

24from ..database.thread_local_session import thread_cleanup

25from ..security.log_sanitizer import redact_secrets

27# RAG indexing imports. The reconciler builds RAG services via the lazily

28# imported ``rag_service_factory`` (see _reconcile_unindexed_documents), so the

29# concrete ``LibraryRAGService`` is no longer imported at module scope.

30from ..database.library_init import get_default_library_id

31from ..database.models.library import Document, DocumentCollection

32from ..constants import DEFAULT_SEARCH_TOOL

# Availability flag for the scheduler; hard-coded to True in this module.
SCHEDULER_AVAILABLE = True  # Always available since it's a required dependency

# Per-tick cap for the opt-in library-collection index sweep. Bounds the work
# done in a single scheduler thread tick so a large backlog of unindexed
# documents self-heals gradually over successive ticks instead of blocking the
# worker thread (the sweep is self-rate-limited by APScheduler max_instances=1
# plus this batch cap).
_LIBRARY_SWEEP_BATCH = 50

class SchedulerCredentialStore(CredentialStoreBase):
    """TTL-bound password store used by the background scheduler.

    Keeps each user's password available to scheduler jobs so they can open
    that user's encrypted database. Storage and expiry are delegated to
    ``CredentialStoreBase``; this class only adapts the call signatures.
    """

    def __init__(self, ttl_hours: int = 48):
        """Create the store.

        Args:
            ttl_hours: Entry lifetime in hours. The base class receives
                ``ttl_hours * 3600`` (presumably seconds — confirm against
                ``CredentialStoreBase``).
        """
        ttl_value = ttl_hours * 3600
        super().__init__(ttl_value)

    def store(self, username: str, password: str) -> None:
        """Remember ``password`` for ``username``."""
        credentials = {"username": username, "password": password}
        self._store_credentials(username, credentials)

    def retrieve(self, username: str) -> str | None:
        """Return the stored password, or ``None`` when the lookup is empty.

        The entry is left in place (``remove=False``) so later jobs can reuse
        it. The password is taken from index 1 of the base-class result.
        """
        entry = self._retrieve_credentials(username, remove=False)
        if not entry:
            return None
        return entry[1]

    def clear(self, username: str) -> None:
        """Drop whatever is stored for ``username``."""
        self.clear_entry(username)

@dataclass(frozen=True)
class DocumentSchedulerSettings:
    """Read-only snapshot of one user's document-scheduler settings.

    The dataclass is frozen, so an instance cannot be mutated after creation
    and can be handed to background threads without copying.
    """

    # Master switch for the document scheduler.
    enabled: bool = True
    # Seconds between scheduled runs (default: 30 minutes).
    interval_seconds: int = 1800
    # Whether the scheduled pass downloads PDFs.
    download_pdfs: bool = False
    # Whether the scheduled pass extracts text.
    extract_text: bool = True
    # Whether RAG indexing is requested.
    generate_rag: bool = False
    # Opt-in flag for sweeping library collections for unindexed documents.
    sweep_library_collections: bool = False
    # Stored value of the last run marker; empty string when never run.
    last_run: str = ""

    @classmethod
    def defaults(cls) -> "DocumentSchedulerSettings":
        """Build an instance populated entirely with the field defaults."""
        return cls()

94class BackgroundJobScheduler:

95 """

96 Singleton scheduler that manages news subscriptions for active users.

98 This scheduler:

99 - Monitors user activity through database access

100 - Temporarily stores user credentials in memory

101 - Automatically schedules subscription checks

102 - Cleans up inactive users after configurable period

103 """

104

105 _instance = None

106 _lock = threading.Lock()

107

def __new__(cls):
    """Return the one shared instance, creating it on first use."""
    # Cheap unlocked check first; only the very first callers take the lock.
    if cls._instance is not None:
        return cls._instance

    with cls._lock:
        # Re-check under the lock: another thread may have created the
        # instance while this one was waiting.
        if cls._instance is None:
            cls._instance = super().__new__(cls)
    return cls._instance

115

def __init__(self):
    """Set up scheduler state; a no-op on every call after the first."""
    # __new__ returns the same object each time, so __init__ is re-entered
    # on every construction. The marker attribute set at the end guards it.
    if hasattr(self, "_initialized"):
        return

    # username -> {"last_activity": ..., "scheduled_jobs": ...}
    self.user_sessions = {}  # user_id -> {last_activity, scheduled_jobs}
    self.lock = threading.Lock()

    # Passwords for tracked users, expiring after 48 hours.
    self._credential_store = SchedulerCredentialStore(ttl_hours=48)

    # The APScheduler instance that actually runs the jobs.
    self.scheduler = BackgroundScheduler()

    # Starts as the built-in defaults; initialize_with_settings() can
    # replace it later.
    self.config = self._load_default_config()

    # Runtime state.
    self.is_running = False
    self._app = None  # Flask app reference for background job contexts

    # Per-user DocumentSchedulerSettings, cached for 300 seconds
    # (5 minutes) to cut down on database reads.
    self._settings_cache: TTLCache = TTLCache(maxsize=100, ttl=300)
    self._settings_cache_lock = threading.Lock()

    self._initialized = True
    logger.info("News scheduler initialized")

146

147 def _load_default_config(self) -> Dict[str, Any]:

148 """Load default configuration (will be overridden by settings manager)."""

149 return {

150 "enabled": True,

151 "retention_hours": 48,

152 "cleanup_interval_hours": 1,

153 "max_jitter_seconds": 300,

154 "max_concurrent_jobs": 10,

155 "subscription_batch_size": 5,

156 "activity_check_interval_minutes": 5,

157 }

158

def initialize_with_settings(self, settings_manager):
    """Replace ``self.config`` with values read from ``settings_manager``.

    Args:
        settings_manager: Object exposing ``get_setting(key, default=...)``.
            It is stored on ``self.settings_manager`` for later lookups.

    Any exception raised while reading settings is logged and swallowed;
    in that case ``self.config`` keeps whatever it held before the call.
    """
    # (config key, settings key, fallback) in the order they are read.
    setting_specs = (
        ("enabled", "news.scheduler.enabled", True),
        ("retention_hours", "news.scheduler.retention_hours", 48),
        (
            "cleanup_interval_hours",
            "news.scheduler.cleanup_interval_hours",
            1,
        ),
        ("max_jitter_seconds", "news.scheduler.max_jitter_seconds", 300),
        ("max_concurrent_jobs", "news.scheduler.max_concurrent_jobs", 10),
        ("subscription_batch_size", "news.scheduler.batch_size", 5),
        (
            "activity_check_interval_minutes",
            "news.scheduler.activity_check_interval",
            5,
        ),
    )

    try:
        self.settings_manager = settings_manager
        loaded = {}
        for config_key, setting_key, fallback in setting_specs:
            loaded[config_key] = self._get_setting(setting_key, fallback)
        # Assigned only after every lookup succeeded, so a failure above
        # leaves the previous configuration untouched.
        self.config = loaded
        log_settings(self.config, "Scheduler configuration loaded")
    except Exception:
        logger.exception("Error loading scheduler settings")

189

190 def _get_setting(self, key: str, default: Any) -> Any:

191 """Get setting with fallback to default."""

192 if hasattr(self, "settings_manager") and self.settings_manager:

193 return self.settings_manager.get_setting(key, default=default)

194 return default

195

def set_app(self, app) -> None:
    """Remember the Flask app so wrapped jobs can push its app context.

    ``start()`` refuses to run until this has been called with a
    non-``None`` app.
    """
    self._app = app

199

200 def _wrap_job(self, func: Callable) -> Callable:

201 """Wrap a scheduler job function so it runs inside a Flask app context.

202

203 APScheduler runs jobs in a thread pool without Flask context.

204 This wrapper pushes an app context before the job runs and pops it after.

205 """

206

207 @wraps(func)

208 def wrapper(*args, **kwargs):

209 if self._app is not None:

210 with self._app.app_context():

211 return func(*args, **kwargs)

212 else:

213 logger.warning(

214 f"No Flask app set on scheduler; running {func.__name__} without app context"

215 )

216 return func(*args, **kwargs)

217

218 return wrapper

219

def _get_document_scheduler_settings(
    self, username: str, force_refresh: bool = False
) -> DocumentSchedulerSettings:
    """Return the document-scheduler settings for ``username``.

    Results are served from ``self._settings_cache`` (a TTL cache) when
    possible; otherwise they are read from the user's database and cached.

    Args:
        username: User whose settings are wanted.
        force_refresh: Skip the cache lookup and always read the database.

    Returns:
        A frozen ``DocumentSchedulerSettings``. Defaults are returned when
        the user has no tracked session, when no password is available, or
        when reading the database fails. Defaults are not cached.
    """
    if not force_refresh:
        with self._settings_cache_lock:
            hit = self._settings_cache.get(username)
            if hit is not None:
                logger.debug(f"[SETTINGS_CACHE] Cache hit for {username}")
                cached_settings: DocumentSchedulerSettings = hit
                return cached_settings

    logger.debug(
        f"[SETTINGS_CACHE] Cache miss for {username}, fetching from DB"
    )

    # A database read needs a tracked session and a stored password.
    if not self.user_sessions.get(username):
        logger.warning(
            f"[SETTINGS_CACHE] No session info for {username}, using defaults"
        )
        return DocumentSchedulerSettings.defaults()

    password = self._credential_store.retrieve(username)
    if not password:
        logger.warning(
            f"[SETTINGS_CACHE] Credentials expired for {username}, using defaults"
        )
        return DocumentSchedulerSettings.defaults()

    # (dataclass field, settings key, fallback) in the order they are read.
    setting_specs = (
        ("enabled", "document_scheduler.enabled", True),
        ("interval_seconds", "document_scheduler.interval_seconds", 1800),
        ("download_pdfs", "document_scheduler.download_pdfs", False),
        ("extract_text", "document_scheduler.extract_text", True),
        ("generate_rag", "document_scheduler.generate_rag", False),
        (
            "sweep_library_collections",
            "document_scheduler.sweep_library_collections",
            False,
        ),
        ("last_run", "document_scheduler.last_run", ""),
    )

    # The database read happens without holding the cache lock so other
    # threads are not blocked on I/O.
    try:
        from ..database.session_context import get_user_db_session
        from ..settings.manager import SettingsManager

        with get_user_db_session(username, password) as db:
            sm = SettingsManager(db)
            values = {
                field_name: sm.get_setting(setting_key, fallback)
                for field_name, setting_key, fallback in setting_specs
            }
            settings = DocumentSchedulerSettings(**values)

            with self._settings_cache_lock:
                self._settings_cache[username] = settings
            logger.debug(f"[SETTINGS_CACHE] Cached settings for {username}")

            return settings

    except Exception as e:
        # ``password`` is a local of this frame (and of
        # ``get_user_db_session``), so a traceback rendered with
        # diagnose=True could expose it. Log only the redacted message,
        # without the traceback.
        safe_msg = redact_secrets(str(e), password)
        logger.warning(
            f"[SETTINGS_CACHE] Error fetching settings for {username}: {safe_msg}"
        )
        return DocumentSchedulerSettings.defaults()

310

def invalidate_user_settings_cache(self, username: str) -> bool:
    """Drop the cached settings of a single user.

    Intended for use after that user's settings change or on logout.

    Args:
        username: User whose cache entry should be removed.

    Returns:
        ``True`` when an entry existed and was deleted, ``False`` when
        there was nothing cached for the user.
    """
    with self._settings_cache_lock:
        if username not in self._settings_cache:
            return False

        del self._settings_cache[username]
        logger.debug(f"[SETTINGS_CACHE] Invalidated cache for {username}")
        return True

331

def invalidate_all_settings_cache(self) -> int:
    """Empty the settings cache for every user.

    Intended for bulk settings updates and configuration reloads.

    Returns:
        How many entries were in the cache just before it was cleared.
    """
    with self._settings_cache_lock:
        removed = len(self._settings_cache)
        self._settings_cache.clear()
        logger.info(
            f"[SETTINGS_CACHE] Cleared all settings cache ({removed} entries)"
        )
        return removed

348

def start(self):
    """Register the housekeeping jobs and start the underlying scheduler.

    Does nothing (beyond logging) when the scheduler is disabled in
    ``self.config`` or is already running.

    Raises:
        RuntimeError: If ``set_app()`` has not been called yet.
    """
    scheduler_enabled = self.config.get("enabled", True)
    if not scheduler_enabled:
        logger.info("News scheduler is disabled in settings")
        return

    if self.is_running:
        logger.warning("Scheduler is already running")
        return

    if self._app is None:
        raise RuntimeError(
            "BackgroundJobScheduler.set_app() must be called before start()"
        )

    # Recurring cleanup of inactive user sessions.
    self.scheduler.add_job(
        self._wrap_job(self._run_cleanup_with_tracking),
        "interval",
        hours=self.config["cleanup_interval_hours"],
        id="cleanup_inactive_users",
        name="Cleanup Inactive User Sessions",
        jitter=60,  # Add some jitter to cleanup
    )

    # Recurring configuration reload, every 30 minutes.
    self.scheduler.add_job(
        self._wrap_job(self._reload_config),
        "interval",
        minutes=30,
        id="reload_config",
        name="Reload Configuration",
    )

    self.scheduler.start()
    self.is_running = True

    # One-off cleanup shortly after startup, added once the scheduler is
    # already running.
    first_cleanup_at = datetime.now(UTC) + timedelta(seconds=30)
    self.scheduler.add_job(
        self._wrap_job(self._run_cleanup_with_tracking),
        "date",
        run_date=first_cleanup_at,
        id="initial_cleanup",
    )

    logger.info("News scheduler started")

396

def stop(self):
    """Shut the scheduler down and forget every tracked user.

    The APScheduler shutdown (waiting for running jobs) only happens when
    the scheduler was running; sessions and stored credentials are
    cleared unconditionally.
    """
    if self.is_running:
        self.scheduler.shutdown(wait=True)
        self.is_running = False

    with self.lock:
        # Wipe each tracked user's password before dropping the sessions.
        for tracked_user in self.user_sessions:
            self._credential_store.clear(tracked_user)
        self.user_sessions.clear()

    logger.info("News scheduler stopped")

410

def update_user_info(self, username: str, password: str):
    """Record activity for a user and (re)schedule their jobs.

    Called on every database interaction. Ignored, with a warning, while
    the scheduler is not running.

    Args:
        username: User's username
        password: User's password
    """
    logger.info(
        f"[SCHEDULER] update_user_info called for {username}, is_running={self.is_running}, active_users={len(self.user_sessions)}"
    )
    logger.debug(
        f"[SCHEDULER] Current active users: {list(self.user_sessions.keys())}"
    )

    if not self.is_running:
        logger.warning(
            f"[SCHEDULER] Scheduler not running, cannot update user {username}"
        )
        return

    with self.lock:
        # The password is stored while holding the lock so concurrent
        # calls cannot leave the session and credentials out of step.
        self._credential_store.store(username, password)

        now = datetime.now(UTC)
        is_new_user = username not in self.user_sessions

        if is_new_user:
            logger.info(f"[SCHEDULER] New user in scheduler: {username}")
            self.user_sessions[username] = {
                "last_activity": now,
                "scheduled_jobs": set(),
            }
            logger.debug(
                f"[SCHEDULER] Created session for {username}, scheduling subscriptions"
            )
        else:
            logger.info(
                f"[SCHEDULER] Updating existing user {username} activity, will reschedule"
            )
            session = self.user_sessions[username]
            old_activity = session["last_activity"]
            activity_delta = now - old_activity
            logger.debug(
                f"[SCHEDULER] User {username} last activity: {old_activity}, delta: {activity_delta}"
            )

            self.user_sessions[username]["last_activity"] = now
            logger.debug(
                f"[SCHEDULER] Updated {username} session info, scheduling subscriptions"
            )

        # New and existing users alike get their jobs rebuilt, in case
        # their subscriptions changed.
        self._schedule_user_subscriptions(username)

468

def unregister_user(self, username: str):
    """Forget a user: remove their jobs, session and stored password.

    Called when the user logs out. The settings-cache invalidation and the
    final log line run even when the user was not being tracked.
    """
    with self.lock:
        if username in self.user_sessions:
            logger.info(f"Unregistering user {username}")

            # Iterate over a snapshot of the tracked job ids.
            tracked_jobs = self.user_sessions[username]["scheduled_jobs"]
            for job_id in tracked_jobs.copy():
                try:
                    self.scheduler.remove_job(job_id)
                except JobLookupError:
                    # The scheduler no longer knows this job; nothing to do.
                    continue

            # Session and credentials go together, under the same lock.
            del self.user_sessions[username]
            self._credential_store.clear(username)

    # The settings cache has its own lock, so this happens after
    # ``self.lock`` has been released.
    self.invalidate_user_settings_cache(username)
    logger.info(f"User {username} unregistered successfully")

493

494 def reschedule_document_jobs(self, username: str) -> bool:

495 """(Re)schedule the document-processing + reconciler jobs for an

496 ACTIVE user against their current settings.

497

498 Call this after a ``document_scheduler.*`` setting changes so the

499 change takes effect on the next interval tick instead of only after

500 the user logs out and back in. Without it, ``_schedule_reconciler``

501 runs only via the login path (``update_user_info`` ->

502 ``_schedule_user_subscriptions`` -> ``_schedule_document_processing``):

503 the runtime gate inside ``_reconcile_unindexed_documents`` neutralises

504 a stale job after a *disable*, but an *enable* (including toggling the

505 legacy ``generate_rag`` arm, which on older builds took effect on the

506 next tick) would otherwise never create the ``{username}_library_sweep``

507 job until the next login.

508

509 Relies on the cache having already been invalidated by the caller

510 (``invalidate_settings_caches``) so ``_schedule_document_processing``

511 re-reads fresh settings. No password argument is needed: the job is

512 rebuilt from the credentials the scheduler already holds for the active

513 session. Returns ``True`` if a reschedule was performed, ``False`` for

514 a user the scheduler isn't tracking (their jobs are built from current

515 settings on their next login) or when the scheduler isn't running.

516 """

517 if not self.is_running:

518 return False

519 with self.lock:

520 if username not in self.user_sessions:

521 logger.debug(

522 f"[DOC_SCHEDULER] reschedule_document_jobs: {username} "

523 "not an active scheduler session; skipping"

524 )

525 return False

526 self.user_sessions[username]["last_activity"] = datetime.now(UTC)

527 # Re-reads fresh settings (cache invalidated by the caller) and

528 # re-adds the document-processing job + reconciler per current

529 # settings; both use replace_existing=True so this is idempotent.

530 self._schedule_document_processing(username)

531 return True

532

533 def reschedule_zotero_jobs(self, username: str) -> bool:

534 """(Re)schedule the Zotero auto-sync job for an ACTIVE user against

535 their current settings.

536

537 Call this after a ``zotero.*`` setting changes (e.g. toggling

538 ``auto_sync_enabled`` or changing ``sync_interval_minutes``) so the

539 change takes effect on the next tick instead of only after the user logs

540 out and back in — otherwise ``_schedule_zotero_sync`` runs only via the

541 login path (``update_user_info`` -> ``_schedule_user_subscriptions``).

542 ``_schedule_zotero_sync`` already removes any existing job and no-ops

543 when auto-sync is disabled, so this both creates and tears down as

544 needed and is idempotent (``replace_existing=True``).

545

546 Relies on the caller having invalidated settings caches first so

547 ``get_config()`` re-reads fresh settings. Returns ``True`` if a

548 reschedule was performed, ``False`` for a user the scheduler isn't

549 tracking (built from current settings on their next login) or when the

550 scheduler isn't running.

551 """

552 if not self.is_running:

553 return False

554 with self.lock:

555 if username not in self.user_sessions:

556 logger.debug(

557 f"[ZOTERO_SCHEDULER] reschedule_zotero_jobs: {username} "

558 "not an active scheduler session; skipping"

559 )

560 return False

561 self.user_sessions[username]["last_activity"] = datetime.now(UTC)

562 self._schedule_zotero_sync(username)

563 return True

564

def _schedule_user_subscriptions(self, username: str):
    """Rebuild all scheduled jobs for ``username``.

    Steps:
      1. Load the user's active ``NewsSubscription`` rows from their
         encrypted database.
      2. Remove every job currently tracked for the user.
      3. Add one job per active subscription: an interval trigger for
         refresh intervals of 60 minutes or less, a one-shot date trigger
         otherwise.
      4. Re-add the document-processing and Zotero sync jobs.

    Returns early (skipping step 4 as well) when the user has no tracked
    session or no stored password. Any other failure in steps 1-3 is
    logged with the password redacted and without a traceback, and
    step 4 still runs.

    Args:
        username: User whose jobs should be rebuilt. Callers in this
            class hold ``self.lock`` while calling.
    """
    logger.info(f"_schedule_user_subscriptions called for {username}")
    # Pre-declared so the leak-redaction in the except handler is safe
    # if the exception fires before ``password`` is assigned below.
    password = None
    try:
        session_info = self.user_sessions.get(username)
        if not session_info:
            logger.warning(f"No session info found for {username}")
            return

        password = self._credential_store.retrieve(username)
        if not password:
            logger.warning(
                f"Credentials expired for {username}, skipping subscription scheduling"
            )
            return
        logger.debug(f"Got password for {username}: present")

        # Get user's subscriptions from their encrypted database
        from ..database.session_context import get_user_db_session
        from ..database.models.news import NewsSubscription

        with get_user_db_session(username, password) as db:
            subscriptions = (
                db.query(NewsSubscription)
                .filter(NewsSubscription.active_filter())
                .all()
            )
            logger.debug(
                f"Query executed, found {len(subscriptions)} results"
            )

            # Log details of each subscription
            for sub in subscriptions:
                logger.debug(
                    f"Subscription {sub.id}: name='{sub.name}', status='{sub.status}', refresh_interval={sub.refresh_interval_minutes} minutes"
                )

            logger.info(
                f"Found {len(subscriptions)} active subscriptions for {username}"
            )

            # Clear old jobs for this user. The id is dropped from the
            # tracked set even when the scheduler no longer knows the job
            # (for example a one-shot job that has already fired);
            # otherwise such ids would stay in the set indefinitely.
            for job_id in session_info["scheduled_jobs"].copy():
                try:
                    self.scheduler.remove_job(job_id)
                except JobLookupError:
                    pass
                session_info["scheduled_jobs"].discard(job_id)

            # Schedule each subscription with jitter
            for sub in subscriptions:
                job_id = f"{username}_{sub.id}"

                # Security: random jitter to distribute subscription
                # timing, not security-sensitive
                max_jitter = int(self.config.get("max_jitter_seconds", 300))
                jitter = random.randint(0, max_jitter)

                refresh_minutes = sub.refresh_interval_minutes

                if refresh_minutes <= 60:  # 60 minutes or less
                    # Hourly or more frequent: recurring interval trigger.
                    # NOTE(review): ``start_date`` anchors the interval at
                    # "now"; the first run presumably fires one interval
                    # later rather than immediately — confirm against the
                    # APScheduler interval-trigger documentation.
                    trigger = "interval"
                    trigger_args = {
                        "minutes": refresh_minutes,
                        "jitter": jitter,
                        "start_date": datetime.now(UTC),
                    }
                else:
                    # Less frequent: compute a single next run time.
                    now = datetime.now(UTC)
                    if sub.next_refresh:
                        # Ensure timezone-aware for comparison with now (UTC)
                        next_refresh_aware = sub.next_refresh
                        if next_refresh_aware.tzinfo is None:
                            logger.warning(
                                f"Subscription {sub.id} has naive (non-tz-aware) "
                                f"next_refresh datetime, assuming UTC"
                            )
                            next_refresh_aware = next_refresh_aware.replace(
                                tzinfo=UTC
                            )
                        if next_refresh_aware <= now:
                            # Overdue: run almost immediately, offset only
                            # by the jitter.
                            logger.info(
                                f"Subscription {sub.id} is overdue, scheduling immediate run"
                            )
                            next_run = now + timedelta(seconds=jitter)
                        else:
                            next_run = next_refresh_aware
                    else:
                        next_run = now + timedelta(
                            minutes=refresh_minutes, seconds=jitter
                        )

                    trigger = "date"
                    trigger_args = {"run_date": next_run}

                # Add the job
                self.scheduler.add_job(
                    func=self._wrap_job(self._check_subscription),
                    args=[username, sub.id],
                    trigger=trigger,
                    id=job_id,
                    name=f"Check {sub.name or sub.query_or_topic[:30]}",
                    replace_existing=True,
                    **trigger_args,
                )

                session_info["scheduled_jobs"].add(job_id)
                logger.info(f"Scheduled job {job_id} with {trigger} trigger")

    except Exception as e:
        # ``password`` was retrieved from the credential store earlier in
        # this method and passed into ``get_user_db_session``. An
        # exception from the DB session (e.g. SQLCipher
        # ``OperationalError``) can carry frame locals that include the
        # plaintext SQLCipher master password — which is unrecoverable
        # (TRUST.md §5). Drop the traceback chain and redact str(e).
        safe_msg = redact_secrets(str(e), password)
        logger.warning(
            f"Error scheduling subscriptions for {username}: {safe_msg}"
        )

    # Add document processing for this user
    self._schedule_document_processing(username)

    # Add Zotero auto-sync for this user (no-op unless enabled)
    self._schedule_zotero_sync(username)

699

def _schedule_document_processing(self, username: str):
    """(Re)schedule or tear down the document jobs for ``username``.

    Reads the user's (cached) document-scheduler settings and then:

    * when the document scheduler is disabled, removes any existing
      ``{username}_document_processing`` and ``{username}_library_sweep``
      jobs and stops tracking them. This matches the state a fresh login
      with the scheduler disabled produces, so disabling via
      ``reschedule_document_jobs`` takes effect without a re-login;
    * otherwise replaces the ``{username}_document_processing`` interval
      job and delegates the reconciler job to ``_schedule_reconciler``.

    Does nothing when the user has no tracked session. Failures are
    logged as warnings without a traceback and never propagate.

    Args:
        username: User whose document jobs should be rebuilt. Callers in
            this class hold ``self.lock`` while calling.
    """
    logger.info(
        f"[DOC_SCHEDULER] Scheduling document processing for {username}"
    )
    logger.debug(
        f"[DOC_SCHEDULER] Current user sessions: {list(self.user_sessions.keys())}"
    )

    try:
        session_info = self.user_sessions.get(username)
        if not session_info:
            logger.warning(
                f"[DOC_SCHEDULER] No session info found for {username}"
            )
            logger.debug(
                f"[DOC_SCHEDULER] Available sessions: {list(self.user_sessions.keys())}"
            )
            return

        logger.debug(
            f"[DOC_SCHEDULER] Retrieved session for {username}, scheduler running: {self.is_running}"
        )

        # Get user's document scheduler settings (cached)
        settings = self._get_document_scheduler_settings(username)

        job_id = f"{username}_document_processing"

        if not settings.enabled:
            # Tear down both document jobs so a disable is not left
            # running on the old schedule until the next login.
            for stale_id in (job_id, f"{username}_library_sweep"):
                try:
                    self.scheduler.remove_job(stale_id)
                    logger.debug(
                        f"[DOC_SCHEDULER] Removed existing job {stale_id}"
                    )
                except JobLookupError:
                    pass  # Job doesn't exist, that's fine
                session_info["scheduled_jobs"].discard(stale_id)
            logger.info(
                f"[DOC_SCHEDULER] Document scheduler disabled for user {username}"
            )
            return

        logger.info(
            f"[DOC_SCHEDULER] User {username} document settings: enabled={settings.enabled}, "
            f"interval={settings.interval_seconds}s, pdfs={settings.download_pdfs}, "
            f"text={settings.extract_text}, "
            f"index={settings.generate_rag or settings.sweep_library_collections}"
        )

        # Schedule document processing job
        logger.debug(f"[DOC_SCHEDULER] Preparing to schedule job {job_id}")

        # Remove existing document job if any
        try:
            self.scheduler.remove_job(job_id)
            session_info["scheduled_jobs"].discard(job_id)
            logger.debug(f"[DOC_SCHEDULER] Removed existing job {job_id}")
        except JobLookupError:
            logger.debug(
                f"[DOC_SCHEDULER] No existing job {job_id} to remove"
            )

        # Add new document processing job
        logger.debug(
            f"[DOC_SCHEDULER] Adding new document processing job with interval {settings.interval_seconds}s"
        )
        self.scheduler.add_job(
            func=self._wrap_job(self._process_user_documents),
            args=[username],
            trigger="interval",
            seconds=settings.interval_seconds,
            id=job_id,
            name=f"Process Documents for {username}",
            jitter=30,  # Small jitter so users do not all process at once
            max_instances=1,  # No overlapping runs for the same user
            replace_existing=True,
        )

        session_info["scheduled_jobs"].add(job_id)
        logger.info(
            f"[DOC_SCHEDULER] Scheduled document processing job {job_id} for {username} with {settings.interval_seconds}s interval"
        )
        logger.debug(
            f"[DOC_SCHEDULER] User {username} now has {len(session_info['scheduled_jobs'])} scheduled jobs: {list(session_info['scheduled_jobs'])}"
        )

        # Verify job was added
        job = self.scheduler.get_job(job_id)
        if job:
            logger.info(
                f"[DOC_SCHEDULER] Successfully verified job {job_id} exists, next run: {job.next_run_time}"
            )
        else:
            logger.error(
                f"[DOC_SCHEDULER] Failed to verify job {job_id} exists!"
            )

        # Schedule (or tear down) the unindexed-document reconciler,
        # mirroring the document-processing job's lifecycle.
        self._schedule_reconciler(username, settings, session_info)

    except Exception as e:
        # No ``password`` local here, but the caller frame
        # (``_schedule_user_subscriptions``) holds the SQLCipher master
        # password — loguru ``diagnose=True`` walks the frame stack and
        # would render that caller-frame local. Drop the traceback by
        # using ``logger.warning`` without exception info.
        # ``redact_secrets`` with ``None`` is a no-op here, but kept for
        # the check-sensitive-logging pre-commit hook and as a guide-post
        # for future refactors that might bring a password into scope.
        safe_msg = redact_secrets(str(e), None)
        logger.warning(
            f"Error scheduling document processing for {username}: {safe_msg}"
        )

808

def _schedule_reconciler(
    self,
    username: str,
    settings: DocumentSchedulerSettings,
    session_info: Dict[str, Any],
) -> None:
    """Create or remove the ``{username}_library_sweep`` job.

    Any existing job is removed first. A new interval job running
    ``_reconcile_unindexed_documents`` is then added only when
    ``sweep_library_collections`` OR ``generate_rag`` is enabled; with
    both off the job stays removed.

    Args:
        username: Owner of the job.
        settings: Settings snapshot supplying the two gating flags and
            ``interval_seconds``.
        session_info: The user's session dict; its ``"scheduled_jobs"``
            set is updated to reflect the job's presence.
    """
    job_id = f"{username}_library_sweep"

    # Remove first, so a disabled setting tears the job down and a changed
    # interval is applied by the re-add below.
    try:
        self.scheduler.remove_job(job_id)
    except JobLookupError:
        pass  # Job doesn't exist, that's fine
    else:
        session_info["scheduled_jobs"].discard(job_id)
        logger.debug(f"[RECONCILER] Removed existing job {job_id}")

    indexing_wanted = (
        settings.sweep_library_collections or settings.generate_rag
    )
    if not indexing_wanted:
        logger.debug(
            f"[RECONCILER] Indexing disabled for {username}; not scheduling"
        )
        return

    reconcile_job = self._wrap_job(self._reconcile_unindexed_documents)
    self.scheduler.add_job(
        func=reconcile_job,
        args=[username],
        trigger="interval",
        seconds=settings.interval_seconds,
        id=job_id,
        name=f"Unindexed Document Reconciler for {username}",
        jitter=60,
        max_instances=1,  # Self-rate-limit: no overlapping runs
        replace_existing=True,
    )
    session_info["scheduled_jobs"].add(job_id)
    logger.info(
        f"[RECONCILER] Scheduled unindexed-document reconciler job {job_id} "
        f"for {username} with {settings.interval_seconds}s interval"
    )

860

def _arm_egress_backstop(self, settings_manager, username: str) -> None:
    """Best-effort: set the audit-hook egress context from saved settings.

    Builds a context from the user's settings snapshot and primary search
    tool and passes it to ``set_active_context``. Never raises: any
    failure is logged at debug level and swallowed, so a backstop problem
    cannot break the scheduler. Returns silently when the snapshot is not
    a dict.

    Per the original note, the DownloadService PEP remains the primary
    gate and the context is cleared by the caller's ``@thread_cleanup``;
    neither is visible from this method.

    Args:
        settings_manager: Source of the settings snapshot and of the
            ``search.tool`` setting.
        username: User the context is built for.
    """
    try:
        from ..security.egress.audit_hook import set_active_context
        from ..security.egress.policy import context_from_snapshot

        snapshot = settings_manager.get_settings_snapshot()
        if not isinstance(snapshot, dict):
            return

        primary = settings_manager.get_setting(
            "search.tool", DEFAULT_SEARCH_TOOL
        )
        # An empty or None setting falls back to the default search tool.
        effective_tool = primary or DEFAULT_SEARCH_TOOL
        set_active_context(
            context_from_snapshot(snapshot, effective_tool, username=username)
        )
    except Exception:
        # NOTE(review): loguru does not appear to treat ``exc_info`` as
        # special (it uses ``logger.opt(exception=True)``), so this
        # probably logs no traceback. Confirm whether that is intended
        # before changing it: ``snapshot`` is a frame local and could be
        # rendered by a diagnose=True traceback.
        audit_logger = logger.bind(policy_audit=True)
        audit_logger.debug(
            "doc scheduler: egress backstop not armed", exc_info=True
        )

886

@thread_cleanup
def _process_user_documents(self, username: str):
    """Run the download/extract pass over a user's newly completed research.

    Steps:
      1. Bail out (with a warning) if the user has no scheduler session or
         the stored credentials have expired.
      2. Load the cached document-scheduler settings; return early unless
         ``download_pdfs`` or ``extract_text`` is enabled.
      3. Query up to 20 of the most recently completed research sessions
         (only those completed after ``last_run`` when it is set).
      4. For each session: set the search context, queue PDF downloads
         and/or extract text for downloadable resources.
      5. Persist ``document_scheduler.last_run`` with the current time.

    RAG indexing is NOT done here; it lives in
    ``_reconcile_unindexed_documents``.

    Exceptions raised inside the run are logged with the password redacted
    and without a traceback; they are not propagated to the scheduler.

    Args:
        username: User whose research sessions should be processed.
    """
    logger.info(f"[DOC_SCHEDULER] Processing documents for user {username}")
    start_time = datetime.now(UTC)

    # Pre-declared so the except handlers can pass it to redact_secrets
    # even if the retrieve() call below itself raises.
    password = None
    try:
        session_info = self.user_sessions.get(username)
        if not session_info:
            logger.warning(
                f"[DOC_SCHEDULER] No session info found for user {username}"
            )
            return

        password = self._credential_store.retrieve(username)
        if not password:
            logger.warning(
                f"[DOC_SCHEDULER] Credentials expired for user {username}"
            )
            return
        logger.debug(
            f"[DOC_SCHEDULER] Starting document processing for {username}"
        )

        # Get user's document scheduler settings (cached)
        settings = self._get_document_scheduler_settings(username)

        logger.info(
            f"[DOC_SCHEDULER] Processing settings for {username}: "
            f"pdfs={settings.download_pdfs}, text={settings.extract_text}"
        )

        # RAG indexing has moved to ``_reconcile_unindexed_documents`` (its
        # own scheduled job), so ``generate_rag`` no longer drives any work
        # in this download/extract pass. Only the file-producing passes gate
        # whether this method runs.
        if not any(
            [
                settings.download_pdfs,
                settings.extract_text,
            ]
        ):
            logger.info(
                f"[DOC_SCHEDULER] No download/extract options enabled for user {username}"
            )
            return

        # Parse last_run from cached settings
        last_run = (
            datetime.fromisoformat(settings.last_run)
            if settings.last_run
            else None
        )

        logger.info(f"[DOC_SCHEDULER] Last run for {username}: {last_run}")

        # Need database session for queries and updates
        from ..database.session_context import get_user_db_session
        from ..database.models.research import ResearchHistory
        from ..settings.manager import SettingsManager

        with get_user_db_session(username, password) as db:
            settings_manager = SettingsManager(db)

            # Arm the PEP-578 audit-hook backstop for this scheduled run.
            # The APScheduler worker thread carries no egress context, so
            # the secondary net would be inactive while DownloadService
            # fetches documents below. DownloadService's evaluate_url PEP
            # still gates each fetch (primary); this restores defense-in-
            # depth parity with an interactive run. @thread_cleanup clears
            # the context when this method returns.
            self._arm_egress_backstop(settings_manager, username)

            # Query for completed research since last run
            logger.debug(
                f"[DOC_SCHEDULER] Querying for completed research since {last_run}"
            )
            query = db.query(ResearchHistory).filter(
                ResearchHistory.status == ResearchStatus.COMPLETED,
                ResearchHistory.completed_at.is_not(
                    None
                ),  # Ensure completed_at is not null
            )

            if last_run:
                query = query.filter(ResearchHistory.completed_at > last_run)

            # Limit to recent research to prevent overwhelming
            query = query.order_by(ResearchHistory.completed_at.desc()).limit(
                20
            )

            research_sessions = query.all()
            logger.debug(
                f"[DOC_SCHEDULER] Query executed, found {len(research_sessions)} sessions"
            )

            if not research_sessions:
                logger.info(
                    f"[DOC_SCHEDULER] No new completed research sessions found for user {username}"
                )
                return

            logger.info(
                f"[DOC_SCHEDULER] Found {len(research_sessions)} research sessions to process for {username}"
            )

            # Log details of each research session
            for i, research in enumerate(
                research_sessions[:5]
            ):  # Log first 5 details
                title_safe = (
                    (research.title[:50] + "...")
                    if research.title
                    else "No title"
                )
                completed_safe = (
                    research.completed_at
                    if research.completed_at
                    else "No completion time"
                )
                logger.debug(
                    f"[DOC_SCHEDULER] Session {i + 1}: id={research.id}, title={title_safe}, completed={completed_safe}"
                )

                # Handle completed_at which might be a string or datetime
                completed_at_obj = None
                if research.completed_at:
                    if isinstance(research.completed_at, str):
                        try:
                            completed_at_obj = datetime.fromisoformat(
                                research.completed_at.replace("Z", "+00:00")
                            )
                        except (ValueError, TypeError, AttributeError):
                            completed_at_obj = None
                    else:
                        completed_at_obj = research.completed_at

                # The comparison below is purely diagnostic. Comparing a
                # naive datetime with an aware one raises TypeError, and
                # this loop sits outside the per-research try, so an
                # unguarded comparison would abort the whole run from a
                # debug log line.
                if last_run and completed_at_obj:
                    try:
                        newer_than_last_run = completed_at_obj > last_run
                    except TypeError:
                        newer_than_last_run = (
                            "N/A (naive/aware datetime mismatch)"
                        )
                else:
                    newer_than_last_run = "N/A"

                logger.debug(
                    f"[DOC_SCHEDULER] - completed_at type: {type(research.completed_at)}"
                )
                logger.debug(
                    f"[DOC_SCHEDULER] - completed_at timezone: {completed_at_obj.tzinfo if completed_at_obj else 'None'}"
                )
                logger.debug(f"[DOC_SCHEDULER] - last_run: {last_run}")
                logger.debug(
                    f"[DOC_SCHEDULER] - completed_at > last_run: {newer_than_last_run}"
                )

            # Capture a settings snapshot for this user/run so the
            # DownloadService below can build an EgressContext and
            # gate each per-resource URL. Without this the scheduler
            # would bypass policy entirely. Reuses the ``db`` session
            # opened above — get_settings_manager() in a background
            # thread must be passed a db_session explicitly per the
            # pre-commit thread-safety check.
            try:
                user_settings_snapshot = (
                    settings_manager.get_settings_snapshot()
                )
            except Exception as e:
                # ``password`` is live in this frame (it opened the
                # surrounding ``get_user_db_session``). Drop traceback
                # + redact str(e) to avoid leaking the SQLCipher
                # master password.
                safe_msg = redact_secrets(str(e), password)
                logger.warning(
                    f"[DOC_SCHEDULER] Could not build settings snapshot: "
                    f"{safe_msg} — downloads will not be scope-gated"
                )
                user_settings_snapshot = None

            # Counts research sessions handled in this run. The text
            # extraction branch keeps its own ``extracted_count`` so the
            # two tallies cannot overwrite each other.
            processed_count = 0
            for research in research_sessions:
                try:
                    logger.info(
                        f"[DOC_SCHEDULER] Processing research {research.id} for user {username}"
                    )

                    # Set search context so rate limiting works in both
                    # download_pdfs and extract_text paths
                    from ..utilities.thread_context import (
                        set_search_context,
                    )

                    set_search_context(
                        {
                            "research_id": str(research.id),
                            "username": username,
                            "user_password": password,
                            "research_phase": "document_scheduler",
                        }
                    )

                    # Call actual processing APIs
                    if settings.download_pdfs:
                        logger.info(
                            f"[DOC_SCHEDULER] Downloading PDFs for research {research.id}"
                        )
                        try:
                            # Use the DownloadService to queue PDF downloads
                            from ..research_library.services.download_service import (
                                DownloadService,
                            )

                            with DownloadService(
                                username,
                                password,
                                settings_snapshot=user_settings_snapshot,
                            ) as download_service:
                                queued_count = (
                                    download_service.queue_research_downloads(
                                        research.id
                                    )
                                )
                                logger.info(
                                    f"[DOC_SCHEDULER] Queued {queued_count} PDF downloads for research {research.id}"
                                )
                        except Exception as e:
                            # Recover the shared thread-local session
                            # before continuing — without rollback the
                            # next phase (text extract) and the
                            # post-loop last_run commit run on a
                            # poisoned session (issue #3827).
                            safe_rollback(db, "DOC_SCHEDULER PDF download")
                            # ``password`` is in scope and was passed
                            # into ``DownloadService``. Drop traceback
                            # + redact str(e) to avoid leaking the
                            # SQLCipher master password under
                            # ``diagnose=True``.
                            safe_msg = redact_secrets(str(e), password)
                            logger.warning(
                                f"[DOC_SCHEDULER] Failed to download PDFs for research {research.id}: {safe_msg}"
                            )

                    if settings.extract_text:
                        logger.info(
                            f"[DOC_SCHEDULER] Extracting text for research {research.id}"
                        )
                        try:
                            # Use the DownloadService to extract text for all resources
                            from ..research_library.services.download_service import (
                                DownloadService,
                            )
                            from ..database.models.research import (
                                ResearchResource,
                            )

                            from ..research_library.utils import (
                                is_downloadable_url,
                            )

                            with DownloadService(
                                username,
                                password,
                                settings_snapshot=user_settings_snapshot,
                            ) as download_service:
                                # Get all resources for this research (reuse existing db session)
                                all_resources = (
                                    db.query(ResearchResource)
                                    .filter_by(research_id=research.id)
                                    .all()
                                )
                                # Filter: only process downloadable resources (academic/PDF)
                                resources = [
                                    r
                                    for r in all_resources
                                    if is_downloadable_url(r.url)
                                ]
                                # Per-research tally of extracted resources;
                                # deliberately NOT named ``processed_count``
                                # (that is the per-run session counter).
                                extracted_count = 0
                                for resource in resources:
                                    # The DownloadService was constructed
                                    # with the user's password above and
                                    # is reused for every resource.
                                    try:
                                        success, error = (
                                            download_service.download_as_text(
                                                resource.id
                                            )
                                        )
                                        if success:
                                            extracted_count += 1
                                            logger.info(
                                                f"[DOC_SCHEDULER] Successfully extracted text for resource {resource.id}"
                                            )
                                        else:
                                            logger.warning(
                                                f"[DOC_SCHEDULER] Failed to extract text for resource {resource.id}: {error}"
                                            )
                                    except Exception as resource_error:
                                        # Roll back FIRST so the next
                                        # iteration's queries don't
                                        # cascade on a poisoned session
                                        # (issue #3827).
                                        safe_rollback(
                                            db,
                                            "DOC_SCHEDULER resource",
                                        )
                                        # ``password`` is in scope and
                                        # was passed into the enclosing
                                        # ``DownloadService``. Drop the
                                        # traceback chain + redact str(e)
                                        # to avoid leaking the SQLCipher
                                        # master password.
                                        safe_msg = redact_secrets(
                                            str(resource_error), password
                                        )
                                        logger.warning(
                                            f"[DOC_SCHEDULER] Error processing resource {resource.id}: {safe_msg}"
                                        )
                                logger.info(
                                    f"[DOC_SCHEDULER] Text extraction completed for research {research.id}: {extracted_count}/{len(resources)} resources processed"
                                )
                        except Exception as e:
                            safe_rollback(db, "DOC_SCHEDULER text extraction")
                            # ``password`` is in scope from the outer
                            # ``_process_user_documents`` retrieval —
                            # same redact + warning pattern as the
                            # inner handlers in this function.
                            safe_msg = redact_secrets(str(e), password)
                            logger.warning(
                                f"[DOC_SCHEDULER] Failed to extract text for research {research.id}: {safe_msg}"
                            )

                    # NOTE: RAG indexing of research downloads used to live
                    # here (the old ``if settings.generate_rag:`` block).
                    # It has been retired — the unified
                    # ``_reconcile_unindexed_documents`` reconciler now
                    # indexes ALL unindexed documents (including research
                    # downloads that have no DocumentCollection row yet) on
                    # its own schedule, gated by ``generate_rag OR
                    # sweep_library_collections``. The download_pdfs and
                    # extract_text passes above remain here because they
                    # produce the ``text_content`` the reconciler indexes.

                    processed_count += 1
                    logger.debug(
                        f"[DOC_SCHEDULER] Successfully queued processing for research {research.id}"
                    )

                except Exception as e:
                    safe_rollback(db, "DOC_SCHEDULER research")
                    # ``password`` is in scope from the outer
                    # ``_process_user_documents`` retrieval. Drop the
                    # traceback chain and redact str(e).
                    safe_msg = redact_secrets(str(e), password)
                    logger.warning(
                        f"[DOC_SCHEDULER] Error processing research {research.id} for user {username}: {safe_msg}"
                    )

            # Update last run time in user's settings.
            # Intentionally NOT wrapped in try/finally: if upstream setup
            # fails (DB open, SettingsManager init, initial query),
            # last_run should stay put so the next tick retries.
            # Advancing here would mask a persistent failure (corrupted
            # DB, wrong password). See closed PR #3288.
            current_time = datetime.now(UTC).isoformat()
            settings_manager.set_setting(
                "document_scheduler.last_run", current_time, commit=True
            )
            logger.debug(
                f"[DOC_SCHEDULER] Updated last run time for {username} to {current_time}"
            )

            end_time = datetime.now(UTC)
            duration = (end_time - start_time).total_seconds()
            logger.info(
                f"[DOC_SCHEDULER] Completed document processing for user {username}: {processed_count} sessions processed in {duration:.2f}s"
            )

    except Exception as e:
        # ``password`` is pre-declared as ``None`` at the top of the
        # function, so it is always bound here even if the retrieve()
        # call itself raised. ``redact_secrets`` silently skips a
        # ``None`` secret. Drop the traceback chain.
        safe_msg = redact_secrets(str(e), password)
        logger.warning(
            f"[DOC_SCHEDULER] Error processing documents for user {username}: {safe_msg}"
        )

1270

@thread_cleanup
def _reconcile_unindexed_documents(self, username: str) -> None:
    """Unified background reconciler that indexes ANY unindexed document.

    Self-healing follow-up to the immediate auto-index queue (PR #3939),
    which caps the queue and DROPS documents on saturation, AND replacement
    for the retired research-scoped ``generate_rag`` indexing block that
    used to live inline in ``_process_user_documents``. A single scheduled
    job now covers every unindexed document, so library uploads and research
    downloads can no longer be permanently missed.

    Two cases are handled per tick, each with its OWN independent
    ``_LIBRARY_SWEEP_BATCH`` budget so the work done in a single thread tick
    stays bounded (total <= 2 x ``_LIBRARY_SWEEP_BATCH``). The budgets are
    decoupled on purpose: case (a) only marks a row indexed on SUCCESS, so a
    block of permanently-failing case-(a) rows must not be able to consume
    case (b)'s budget and starve the research-orphan path:

    (a) In-collection unindexed: documents that already have a
        ``DocumentCollection`` link (e.g. manual uploads) with ``indexed``
        False and text content. Indexed via the per-collection RAG factory
        so each collection's own embedding config is honored.
    (b) Research orphans: ``Document`` rows with ``research_id`` set and
        text content that have NO ``DocumentCollection`` link in the default
        library collection yet. ``index_document(doc_id,
        default_library_id, ...)`` is relied on to create the link and
        index in one call.

    Behaviour:
    - Gated at runtime by ``settings.enabled`` AND (``sweep_library_
      collections`` OR ``generate_rag``); returns early otherwise.
    - Uses ``index_document(..., force_reindex=False)`` and only selects
      rows that are not yet indexed.
    - Candidates are picked in random order so permanently failing rows
      cannot monopolise the per-tick LIMIT.
    - A failed document rolls back the shared ``db`` session before the
      next document is attempted, mirroring the per-resource handling in
      ``_process_user_documents``.
    - Exceptions are logged with the password redacted and without a
      traceback; they are not propagated to the scheduler.

    Args:
        username: User whose documents should be reconciled.
    """
    logger.info(
        f"[RECONCILER] Starting unindexed-document reconcile for user {username}"
    )

    # Pre-declared so the except handlers can pass it to redact_secrets
    # even if the retrieve() call below itself raises.
    password = None
    try:
        session_info = self.user_sessions.get(username)
        if not session_info:
            logger.warning(
                f"[RECONCILER] No session info found for user {username}"
            )
            return

        password = self._credential_store.retrieve(username)
        if not password:
            logger.warning(
                f"[RECONCILER] Credentials expired for user {username}"
            )
            return

        # Get user's document scheduler settings (cached).
        settings = self._get_document_scheduler_settings(username)

        # Gate at runtime too, not just at scheduling: the already-live
        # APScheduler job keeps firing after the document scheduler is
        # disabled until the next reschedule, and the setting description
        # promises the sweep only runs while the scheduler is enabled.
        # OFF by default: runs only when the scheduler is enabled AND
        # EITHER opt-in is set. The generate_rag arm preserves the legacy
        # research-download indexing behaviour from _process_user_documents.
        if not settings.enabled or not (
            settings.sweep_library_collections or settings.generate_rag
        ):
            logger.debug(f"[RECONCILER] Indexing disabled for user {username}")
            return

        # Lazy import of the RAG factory. Imported here (not at module
        # top) to keep the import surface of this scheduler module small
        # and consistent with the other lazy imports in this file.
        from ..research_library.services.rag_service_factory import (
            get_rag_service,
        )

        from ..database.session_context import get_user_db_session
        from ..settings.manager import SettingsManager

        with get_user_db_session(username, password) as db:
            settings_manager = SettingsManager(db)

            # Arm the PEP-578 audit-hook backstop for this scheduled run,
            # mirroring _process_user_documents. Indexing itself doesn't
            # download, but embedding providers may make network calls;
            # this keeps defense-in-depth parity with an interactive run.
            # Cleared by @thread_cleanup on exit.
            self._arm_egress_backstop(settings_manager, username)

            total_indexed = 0

            # ---- Case (a): in-collection unindexed documents ----------
            # Bounded by _LIBRARY_SWEEP_BATCH so a large backlog self-heals
            # over successive ticks. Only rows with actual text content can
            # be indexed. RANDOMIZED selection (not a stable id order): a
            # row leaves this candidate set only on SUCCESS and we track no
            # per-row failure state, so a deterministic order would let a
            # block of permanently-failing low-id rows win the LIMIT slots
            # every tick and starve indexable higher-id rows forever.
            unindexed = (
                db.query(
                    DocumentCollection.document_id,
                    DocumentCollection.collection_id,
                )
                .join(
                    Document,
                    Document.id == DocumentCollection.document_id,
                )
                .filter(
                    DocumentCollection.indexed.is_(False),
                    Document.text_content.isnot(None),
                )
                .order_by(func.random())
                .limit(_LIBRARY_SWEEP_BATCH)
                .all()
            )

            # Group document ids by collection so we build exactly one RAG
            # service per collection (each collection can have its own
            # embedding config).
            docs_by_collection: Dict[str, List[str]] = {}
            for doc_id, coll_id in unindexed:
                docs_by_collection.setdefault(coll_id, []).append(doc_id)

            if unindexed:
                logger.info(
                    f"[RECONCILER] Found {len(unindexed)} in-collection "
                    f"unindexed document(s) across "
                    f"{len(docs_by_collection)} collection(s) for {username}"
                )

            for coll_id, doc_ids in docs_by_collection.items():
                try:
                    # USE THE FACTORY so per-collection embedding settings
                    # (model/provider/chunking/etc.) stored on the
                    # collection are honored.
                    with get_rag_service(
                        username,
                        collection_id=coll_id,
                        db_password=password,
                    ) as rag_service:
                        for doc_id in doc_ids:
                            try:
                                result = rag_service.index_document(
                                    document_id=doc_id,
                                    collection_id=coll_id,
                                    force_reindex=False,
                                )
                                if result.get("status") == "success":
                                    total_indexed += 1
                                    logger.debug(
                                        f"[RECONCILER] Indexed document {doc_id} "
                                        f"into collection {coll_id} with "
                                        f"{result.get('chunk_count', 0)} chunks"
                                    )
                                else:
                                    # Make non-success outcomes visible
                                    # instead of dropping them silently;
                                    # debug level because a permanently
                                    # failing row can recur every tick.
                                    logger.debug(
                                        f"[RECONCILER] Document {doc_id} not "
                                        f"indexed into collection {coll_id} "
                                        f"(status={result.get('status')})"
                                    )
                            except Exception as doc_error:
                                # Roll back FIRST so the next document
                                # (and the case (b) query below) do not
                                # run on a poisoned shared session — same
                                # pattern as the per-resource handler in
                                # _process_user_documents.
                                safe_rollback(db, "RECONCILER document")
                                # ``password`` is in scope and was passed
                                # into ``get_rag_service``. Drop the
                                # traceback chain + redact str(e) to avoid
                                # leaking the SQLCipher master password.
                                safe_msg = redact_secrets(
                                    str(doc_error), password
                                )
                                logger.warning(
                                    f"[RECONCILER] Failed to index document "
                                    f"{doc_id} into collection {coll_id}: {safe_msg}"
                                )
                except Exception as coll_error:
                    # Recover the shared thread-local session before moving
                    # on to the next collection so its queries don't run on
                    # a poisoned session.
                    safe_rollback(db, "RECONCILER collection")
                    safe_msg = redact_secrets(str(coll_error), password)
                    logger.warning(
                        f"[RECONCILER] Failed to index collection {coll_id}: {safe_msg}"
                    )

            # ---- Case (b): research orphans -> default library ---------
            # Research downloads land as Document rows (research_id set)
            # with NO DocumentCollection link yet.
            #
            # Case (b) gets its OWN independent _LIBRARY_SWEEP_BATCH budget
            # rather than the leftover of case (a). Case (a) only flips a
            # row to indexed=True on SUCCESS, so a block of permanently
            # failing case-(a) rows would otherwise fill the LIMIT every
            # tick, leave the leftover at 0, and starve the research-orphan
            # path forever. Decoupling the budgets caps total work at
            # 2 x _LIBRARY_SWEEP_BATCH per tick. RANDOMIZED selection (see
            # case (a)) so a block of permanently-failing orphans can't pin
            # the LIMIT slots every tick and starve the rest.
            #
            # Resolve the default library collection once.
            default_library_id = get_default_library_id(username, password)

            orphans = (
                db.query(Document.id)
                .outerjoin(
                    DocumentCollection,
                    (DocumentCollection.document_id == Document.id)
                    & (DocumentCollection.collection_id == default_library_id),
                )
                .filter(
                    Document.research_id.isnot(None),
                    Document.text_content.isnot(None),
                    DocumentCollection.id.is_(None),
                )
                .order_by(func.random())
                .limit(_LIBRARY_SWEEP_BATCH)
                .all()
            )

            if orphans:
                logger.info(
                    f"[RECONCILER] Found {len(orphans)} research "
                    f"orphan document(s) to ingest into the default "
                    f"library for {username}"
                )
                try:
                    with get_rag_service(
                        username,
                        collection_id=default_library_id,
                        db_password=password,
                    ) as rag_service:
                        for (doc_id,) in orphans:
                            try:
                                result = rag_service.index_document(
                                    document_id=doc_id,
                                    collection_id=default_library_id,
                                    force_reindex=False,
                                )
                                if result.get("status") == "success":
                                    total_indexed += 1
                                    logger.debug(
                                        f"[RECONCILER] Ingested + "
                                        f"indexed research orphan "
                                        f"{doc_id} into the default "
                                        f"library with "
                                        f"{result.get('chunk_count', 0)} chunks"
                                    )
                                else:
                                    # Surface non-success outcomes (see
                                    # case (a)).
                                    logger.debug(
                                        f"[RECONCILER] Research orphan "
                                        f"{doc_id} not indexed "
                                        f"(status={result.get('status')})"
                                    )
                            except Exception as doc_error:
                                # Keep one bad orphan from poisoning the
                                # shared session for the rest of the batch.
                                safe_rollback(db, "RECONCILER orphan document")
                                safe_msg = redact_secrets(
                                    str(doc_error), password
                                )
                                logger.warning(
                                    f"[RECONCILER] Failed to index "
                                    f"research orphan {doc_id}: {safe_msg}"
                                )
                except Exception as orphan_error:
                    safe_rollback(db, "RECONCILER orphans")
                    safe_msg = redact_secrets(str(orphan_error), password)
                    logger.warning(
                        f"[RECONCILER] Failed to index research "
                        f"orphans into the default library: {safe_msg}"
                    )

            logger.info(
                f"[RECONCILER] Completed reconcile for user {username}: "
                f"{total_indexed} document(s) indexed "
                f"(per-case batch cap {_LIBRARY_SWEEP_BATCH})"
            )

    except Exception as e:
        # ``password`` is pre-declared as ``None`` at the top of the
        # function, so it is always bound here even if the retrieve()
        # call itself raised. ``redact_secrets`` silently skips a
        # ``None`` secret. Drop the traceback chain.
        safe_msg = redact_secrets(str(e), password)
        logger.warning(
            f"[RECONCILER] Error reconciling unindexed documents for user {username}: {safe_msg}"
        )

1568

def get_document_scheduler_status(self, username: str) -> Dict[str, Any]:
    """Report the document scheduler state for one user.

    Args:
        username: User to look up in ``self.user_sessions``.

    Returns:
        A dict. When the user has no scheduler session, or when building
        the status fails, it holds only ``enabled`` (False) and a
        ``message``. Otherwise it holds ``enabled``, ``interval_seconds``,
        ``processing_options``, ``last_run``, ``has_scheduled_job`` and
        ``user_active``.
    """
    try:
        user_session = self.user_sessions.get(username)
        if not user_session:
            return {
                "enabled": False,
                "message": "User not found in scheduler",
            }

        # Cached per-user document scheduler settings.
        doc_settings = self._get_document_scheduler_settings(username)

        # The periodic processing job is tracked by id in the session's
        # ``scheduled_jobs`` set.
        tracked_jobs = user_session.get("scheduled_jobs", set())
        job_is_tracked = f"{username}_document_processing" in tracked_jobs

        return {
            "enabled": doc_settings.enabled,
            "interval_seconds": doc_settings.interval_seconds,
            "processing_options": {
                "download_pdfs": doc_settings.download_pdfs,
                "extract_text": doc_settings.extract_text,
                # generate_rag and sweep_library_collections both gate the
                # unified reconciler (_reconcile_unindexed_documents).
                "generate_rag": doc_settings.generate_rag,
                "sweep_library_collections": doc_settings.sweep_library_collections,
            },
            "last_run": doc_settings.last_run,
            "has_scheduled_job": job_is_tracked,
            "user_active": username in self.user_sessions,
        }

    except Exception as e:
        # There is no ``password`` local here, but caller frames (e.g. a
        # route handler that already retrieved the user's password) could
        # be rendered under loguru ``diagnose=True``, so log a plain
        # warning without any traceback.
        safe_msg = redact_secrets(str(e), None)
        logger.warning(
            f"Error getting document scheduler status for user {username}: {safe_msg}"
        )
        return {
            "enabled": False,
            "message": f"Failed to retrieve scheduler status: {type(e).__name__}",
        }

1616

def trigger_document_processing(self, username: str) -> bool:
    """Queue a one-off document processing run for a user.

    Adds a ``date``-triggered job that runs ``_process_user_documents``
    one second from now, then confirms the scheduler knows the job.

    Args:
        username: User whose documents should be processed.

    Returns:
        True when the manual job was added and could be looked up again;
        False when the user has no scheduler session, the scheduler is not
        running, the job could not be verified, or any error occurred.
    """
    logger.info(
        f"[DOC_SCHEDULER] Manual trigger requested for user {username}"
    )
    try:
        if not self.user_sessions.get(username):
            logger.warning(
                f"[DOC_SCHEDULER] User {username} not found in scheduler"
            )
            logger.debug(
                f"[DOC_SCHEDULER] Available users: {list(self.user_sessions.keys())}"
            )
            return False

        if not self.is_running:
            logger.warning(
                f"[DOC_SCHEDULER] Scheduler not running, cannot trigger document processing for {username}"
            )
            return False

        # A fixed id plus replace_existing=True is passed so a repeated
        # trigger reuses the same job id.
        job_id = f"{username}_document_processing_manual"
        logger.debug(f"[DOC_SCHEDULER] Scheduling manual job {job_id}")

        fire_at = datetime.now(UTC) + timedelta(seconds=1)
        self.scheduler.add_job(
            func=self._wrap_job(self._process_user_documents),
            args=[username],
            trigger="date",
            run_date=fire_at,
            id=job_id,
            name=f"Manual Document Processing for {username}",
            replace_existing=True,
        )

        # Confirm the scheduler actually registered the job.
        scheduled = self.scheduler.get_job(job_id)
        if not scheduled:
            logger.error(
                f"[DOC_SCHEDULER] Failed to verify manual job {job_id} was added!"
            )
            return False

        logger.info(
            f"[DOC_SCHEDULER] Successfully triggered manual document processing for user {username}, job {job_id}, next run: {scheduled.next_run_time}"
        )
        return True

    except Exception as e:
        # No ``password`` local in this method, but caller frames could
        # hold one — log without a traceback so frame locals are never
        # rendered under ``diagnose=True``.
        safe_msg = redact_secrets(str(e), None)
        logger.warning(
            f"[DOC_SCHEDULER] Error triggering document processing for user {username}: {safe_msg}"
        )
        return False

1676

1677 # -- Zotero auto-sync -------------------------------------------------

1678

def _schedule_zotero_sync(self, username: str):
    """Create, refresh or remove the user's Zotero auto-sync job.

    The job is only (re)scheduled when the Zotero integration is configured
    and background auto-sync is enabled. Any previously scheduled job is
    removed first, so switching the setting off unschedules it. Errors are
    logged with the password redacted and are not raised.

    Args:
        username: User whose Zotero sync job should be (re)scheduled.
    """
    # Bound up front so the except handler can hand it to redact_secrets
    # even when retrieve()/get_config() raises — get_config() opens the
    # encrypted DB, so an error message could embed the SQLCipher key.
    password = None
    try:
        session_info = self.user_sessions.get(username)
        if not session_info:
            return

        job_id = f"{username}_zotero_sync"

        def _unschedule_current():
            # Remove the job and its bookkeeping entry; a missing job is
            # not an error.
            try:
                self.scheduler.remove_job(job_id)
                session_info["scheduled_jobs"].discard(job_id)
            except JobLookupError:
                pass

        password = self._credential_store.retrieve(username)
        if not password:
            # Expired credentials: the job could never run anyway, and
            # disabling auto-sync in this state must still unschedule it —
            # otherwise it keeps ticking (and logging a warning) until
            # logout.
            _unschedule_current()
            return

        from ..research_library.zotero import ZoteroSyncService

        zotero_cfg = ZoteroSyncService(username, password).get_config()

        # Remove only AFTER the config read succeeded: a transient error
        # from the encrypted-DB read must leave a healthy existing job in
        # place (it would otherwise stay gone until the next login).
        # Accepted trade-off: if that error races a just-disabled toggle,
        # the stale job stays scheduled — but it is inert, because
        # _sync_user_zotero re-checks the config at every tick and no-ops
        # when auto-sync is disabled.
        _unschedule_current()

        if not zotero_cfg.is_configured or not zotero_cfg.auto_sync_enabled:
            logger.debug(
                f"[ZOTERO_SCHEDULER] Auto-sync not enabled for {username}"
            )
            return

        # Unset/zero interval falls back to 360 minutes; never tick more
        # often than every 15 minutes.
        configured_minutes = int(zotero_cfg.sync_interval_minutes or 360)
        interval_minutes = max(15, configured_minutes)

        self.scheduler.add_job(
            func=self._wrap_job(self._sync_user_zotero),
            args=[username],
            trigger="interval",
            minutes=interval_minutes,
            id=job_id,
            name=f"Zotero Sync for {username}",
            jitter=60,
            max_instances=1,
            replace_existing=True,
        )
        session_info["scheduled_jobs"].add(job_id)
        logger.info(
            f"[ZOTERO_SCHEDULER] Scheduled Zotero sync for {username} "
            f"every {interval_minutes} minutes"
        )
    except Exception as e:
        safe_msg = redact_secrets(str(e), password)
        logger.warning(
            f"[ZOTERO_SCHEDULER] Error scheduling Zotero sync for "
            f"{username}: {safe_msg}"
        )

1755

@thread_cleanup
def _sync_user_zotero(self, username: str):
    """Scheduler job body: perform one Zotero sync pass for ``username``.

    Skips silently when the user has no tracked session, logs a warning
    when credentials cannot be retrieved, and no-ops when the Zotero config
    is not configured or auto-sync is disabled. Exceptions are logged with
    the password redacted and are not re-raised.
    """
    # Bound up front so the except handler can always redact it.
    password = None
    try:
        if not self.user_sessions.get(username):
            return

        password = self._credential_store.retrieve(username)
        if not password:
            logger.warning(
                f"[ZOTERO_SCHEDULER] Credentials expired for {username}"
            )
            return

        from ..research_library.zotero import ZoteroSyncService

        service = ZoteroSyncService(username, password)
        zotero_cfg = service.get_config()
        # Re-checked on every run so a stale job stays inert once the
        # user has switched auto-sync off.
        if not zotero_cfg.is_configured or not zotero_cfg.auto_sync_enabled:
            logger.debug(
                f"[ZOTERO_SCHEDULER] Auto-sync not enabled for {username}"
            )
            return

        logger.info(f"[ZOTERO_SCHEDULER] Running Zotero sync for {username}")
        outcome = service.sync_all()
        counters = ", ".join(
            f"{field}={outcome.get(field)}"
            for field in ("imported", "updated", "removed", "skipped", "errors")
        )
        logger.info(
            f"[ZOTERO_SCHEDULER] Zotero sync for {username}: {counters}"
        )
    except Exception as exc:
        # Only the redacted message is logged (no traceback), so the
        # database password handed to the sync service cannot leak.
        logger.warning(
            f"[ZOTERO_SCHEDULER] Error syncing Zotero for {username}: "
            f"{redact_secrets(str(exc), password)}"
        )

1804

@thread_cleanup
def _check_user_overdue_subscriptions(self, username: str):
    """Queue a one-shot refresh job for every overdue subscription.

    Does nothing when the user has no tracked session or no retrievable
    credentials. Each overdue subscription gets a ``date``-triggered job
    that fires 1-30 seconds from now. Exceptions are logged with the
    password redacted and are not re-raised.
    """
    # Bound before the try so the handler can redact it even when the
    # retrieve() call itself raises.
    password = None
    try:
        if not self.user_sessions.get(username):
            return

        password = self._credential_store.retrieve(username)
        if not password:
            return

        from ..database.session_context import get_user_db_session
        from ..database.models.news import NewsSubscription

        with get_user_db_session(username, password) as db:
            now = datetime.now(UTC)
            overdue = (
                db.query(NewsSubscription)
                .filter(NewsSubscription.due_filter(now))
                .all()
            )

            if overdue:
                logger.info(
                    f"Found {len(overdue)} overdue subscriptions for {username}"
                )

            stamp = int(now.timestamp())
            for sub in overdue:
                # Staggering only - the delay is not security-sensitive.
                delay_seconds = random.randint(1, 30)

                self.scheduler.add_job(
                    func=self._wrap_job(self._check_subscription),
                    args=[username, sub.id],
                    trigger="date",
                    run_date=now + timedelta(seconds=delay_seconds),
                    id=f"overdue_{username}_{sub.id}_{stamp}",
                    name=f"Overdue: {sub.name or sub.query_or_topic[:30]}",
                    replace_existing=True,
                )

                logger.info(
                    f"Scheduled overdue subscription {sub.id} to run in {delay_seconds} seconds"
                )

    except Exception as exc:
        # Log the redacted message only; the password was passed into
        # get_user_db_session and must not surface in the log.
        logger.warning(
            f"Error checking overdue subscriptions for {username}: "
            f"{redact_secrets(str(exc), password)}"
        )

1868

@thread_cleanup
def _check_subscription(self, username: str, subscription_id: int):
    """Refresh one subscription and, for date-triggered jobs, re-queue it.

    Steps:
      1. If the user has no tracked session, remove the subscription's job
         and stop.
      2. If credentials cannot be retrieved, log a warning and stop.
      3. Load the subscription; stop unless it is active.
      4. Substitute ``YYYY-MM-DD`` in the query with the user's local date,
         advance the refresh schedule and commit.
      5. Run the research synchronously.
      6. If the job uses a ``DateTrigger``, schedule the next run after the
         subscription's refresh interval plus random jitter.

    Exceptions are logged with the password redacted and are not re-raised.
    """
    logger.info(
        f"_check_subscription called for user {username}, subscription {subscription_id}"
    )
    # Bound before the try so the handler can redact it even when the
    # retrieve() call itself raises.
    password = None
    try:
        job_id = f"{username}_{subscription_id}"

        session_info = self.user_sessions.get(username)
        if not session_info:
            # User no longer active, cancel job
            try:
                self.scheduler.remove_job(job_id)
            except JobLookupError:
                pass
            return

        password = self._credential_store.retrieve(username)
        if not password:
            logger.warning(
                f"Credentials expired for {username}, skipping subscription check"
            )
            return

        # Get subscription details
        from ..database.session_context import get_user_db_session
        from ..database.models.news import (
            NewsSubscription,
            SubscriptionStatus,
        )
        from ..news.subscription_runner import advance_refresh_schedule

        with get_user_db_session(username, password) as db:
            sub = db.get(NewsSubscription, subscription_id)
            if not sub or sub.status != SubscriptionStatus.ACTIVE.value:
                logger.info(
                    f"Subscription {subscription_id} not active, skipping"
                )
                return

            # Prepare query with date replacement using user's timezone
            query = sub.query_or_topic
            if "YYYY-MM-DD" in query:
                from local_deep_research.news.core.utils import (
                    get_local_date_string,
                )
                from ..settings.manager import SettingsManager

                settings_manager = SettingsManager(db)
                local_date = get_local_date_string(settings_manager)
                query = query.replace("YYYY-MM-DD", local_date)

            # Update last/next refresh times
            advance_refresh_schedule(sub, datetime.now(UTC))
            db.commit()

            # Copy everything needed later into plain values while the
            # row is still attached to the session.
            subscription_data = {
                "id": sub.id,
                "name": sub.name,
                "query": query,
                "original_query": sub.query_or_topic,
                "model_provider": sub.model_provider,
                "model": sub.model,
                "search_strategy": sub.search_strategy,
                "search_engine": sub.search_engine,
            }
            # Snapshot the interval here too: the reschedule step below
            # runs after the (potentially long) research call and should
            # not depend on the ORM instance still being loadable.
            refresh_interval_minutes = sub.refresh_interval_minutes

        logger.info(
            f"Refreshing subscription {subscription_id}: {subscription_data['name']}"
        )

        # Trigger research synchronously using requests with proper auth
        self._trigger_subscription_research_sync(username, subscription_data)

        # Date-triggered jobs fire once, so they must be re-queued here;
        # other trigger types are left to the scheduler.
        job = self.scheduler.get_job(job_id)
        if job and job.trigger.__class__.__name__ == "DateTrigger":
            # Random jitter spreads subscription timing; it is not
            # security-sensitive.
            next_run = datetime.now(UTC) + timedelta(
                minutes=refresh_interval_minutes,
                seconds=random.randint(
                    0, int(self.config.get("max_jitter_seconds", 300))
                ),
            )
            self.scheduler.add_job(
                func=self._wrap_job(self._check_subscription),
                args=[username, subscription_id],
                trigger="date",
                run_date=next_run,
                id=job_id,
                replace_existing=True,
            )

    except Exception as e:
        # Log the redacted message only; the password was passed into
        # get_user_db_session and must not surface in the log.
        safe_msg = redact_secrets(str(e), password)
        logger.warning(
            f"Error checking subscription {subscription_id}: {safe_msg}"
        )

1977

@thread_cleanup
def _trigger_subscription_research_sync(
    self, username: str, subscription: Dict[str, Any]
):
    """Run a subscription's research in-process and store the result.

    ``subscription`` is the plain dict built by ``_check_subscription``
    (keys read here: ``id``, ``name``, ``query``, ``original_query``,
    ``model``, ``model_provider``, ``search_strategy``, ``search_engine``).
    The user's settings snapshot is loaded, optionally overridden with the
    subscription's search engine, installed as this thread's settings
    context, and passed to ``quick_summary``. The result is handed to
    ``_store_research_result``. Exceptions are logged with the password
    redacted and are not re-raised.
    """
    import uuid

    from ..config.thread_settings import set_settings_context

    # Bound before the try so the handler can redact it even when the
    # retrieve() call itself raises.
    password = None
    try:
        if not self.user_sessions.get(username):
            logger.error(f"No session info for user {username}")
            return

        password = self._credential_store.retrieve(username)
        if not password:
            logger.error(f"Credentials expired for user {username}")
            return

        sub_id = subscription["id"]
        research_id = str(uuid.uuid4())
        logger.info(
            f"Starting research {research_id} for subscription {sub_id}"
        )

        from ..database.session_context import get_user_db_session
        from ..settings.manager import SettingsManager

        with get_user_db_session(username, password) as db:
            settings_snapshot = SettingsManager(db).get_settings_snapshot()

        # A search engine set on the subscription wins over the user's
        # default search tool.
        search_engine = subscription.get("search_engine")
        if not search_engine:
            default_search_tool = settings_snapshot.get(
                "search.tool", DEFAULT_SEARCH_TOOL
            )
            logger.info(
                f"Using user's default search tool: '{default_search_tool}' for {sub_id}"
            )
        else:
            settings_snapshot["search.tool"] = {
                "value": search_engine,
                "ui_element": "select",
            }
            logger.info(
                f"Using subscription's search engine: '{search_engine}' for {sub_id}"
            )

        logger.debug(
            f"Settings snapshot has {len(settings_snapshot)} settings"
        )
        # Log a few key settings to verify they're present
        logger.debug(
            f"Key settings: llm.model={settings_snapshot.get('llm.model')}, llm.provider={settings_snapshot.get('llm.provider')}, search.tool={settings_snapshot.get('search.tool')}"
        )

        sub_name = subscription["name"]
        metadata = {
            "is_news_search": True,
            "search_type": "news_analysis",
            "display_in": "news_feed",
            "subscription_id": sub_id,
            "triggered_by": "scheduler",
            "subscription_name": sub_name,
            "title": sub_name or None,
            "scheduled_at": datetime.now(UTC).isoformat(),
            "original_query": subscription["original_query"],
            "user_id": username,
        }

        from ..api.research_functions import quick_summary

        # Install the snapshot as this thread's settings context.
        set_settings_context(SnapshotSettingsContext(settings_snapshot))

        quick_summary_kwargs = {
            "query": subscription["query"],
            "research_id": research_id,
            "username": username,
            "user_password": password,
            "settings_snapshot": settings_snapshot,
            "model_name": subscription.get("model"),
            "provider": subscription.get("model_provider"),
            "metadata": metadata,
            # Don't send long subscription prompts to search engines
            "search_original_query": False,
        }
        # Passed only when set, so quick_summary's own default applies
        # otherwise.
        search_strategy = subscription.get("search_strategy")
        if search_strategy:
            quick_summary_kwargs["search_strategy"] = search_strategy

        result = quick_summary(**quick_summary_kwargs)

        logger.info(
            f"Completed research {research_id} for subscription {sub_id}"
        )

        self._store_research_result(
            username,
            password,
            research_id,
            sub_id,
            result,
            subscription,
        )

    except Exception as exc:
        # The password was passed to get_user_db_session, quick_summary
        # and _store_research_result; log only the redacted message.
        logger.warning(
            f"Error triggering research for subscription {subscription['id']}: "
            f"{redact_secrets(str(exc), password)}"
        )

2116

def _store_research_result(
    self,
    username: str,
    password: str,
    research_id: str,
    subscription_id: int,
    result: Dict[str, Any],
    subscription: Dict[str, Any],
):
    """Persist a finished subscription research run for news display.

    Args:
        username: Owner of the per-user database.
        password: Database password; used to open the session and for
            redacting error messages.
        research_id: Identifier used for the ``ResearchHistory`` row, the
            stored sources and the stored report.
        subscription_id: Subscription that triggered the run.
        result: Research output. It is normalised with a local
            ``make_serializable`` helper first; ``report`` (or
            ``summary``), ``sources`` and ``query`` are read from the
            normalised mapping.
        subscription: Subscription dict (``name`` and ``query`` are read).

    The method formats citations in the report, generates a headline and
    topics, writes the history row, then saves sources and the report
    content. Exceptions are logged with the password redacted and are not
    re-raised.
    """
    try:
        from ..database.session_context import get_user_db_session
        from ..database.models import ResearchHistory
        from ..settings.manager import SettingsManager
        import json

        def make_serializable(obj):
            """Convert non-serializable objects to dictionaries."""
            if hasattr(obj, "dict"):
                return obj.dict()
            if hasattr(obj, "__dict__"):
                return {
                    k: make_serializable(v)
                    for k, v in obj.__dict__.items()
                    if not k.startswith("_")
                }
            if isinstance(obj, (list, tuple)):
                return [make_serializable(item) for item in obj]
            if isinstance(obj, dict):
                return {k: make_serializable(v) for k, v in obj.items()}
            return obj

        serializable_result = make_serializable(result)

        with get_user_db_session(username, password) as db:
            # Settings are stored alongside the result in research_meta.
            settings_manager = SettingsManager(db)
            settings_snapshot = settings_manager.get_settings_snapshot()

            # Report text lives under 'report', falling back to 'summary'.
            report_content = serializable_result.get(
                "report"
            ) or serializable_result.get("summary")
            logger.debug(
                f"Report content length: {len(report_content) if report_content else 0} chars"
            )

            # Sources are written AFTER the history row is committed
            # below (FK requires research_id to exist).
            sources = serializable_result.get("sources", [])

            if report_content:
                from ..text_optimization.citation_formatter import (
                    CitationFormatter,
                    CitationMode,
                )
                from ..config.search_config import (
                    get_setting_from_snapshot,
                )

                citation_format = get_setting_from_snapshot(
                    "report.citation_format", "domain_id_hyperlinks"
                )
                mode_map = {
                    "number_hyperlinks": CitationMode.NUMBER_HYPERLINKS,
                    "domain_hyperlinks": CitationMode.DOMAIN_HYPERLINKS,
                    "domain_id_hyperlinks": CitationMode.DOMAIN_ID_HYPERLINKS,
                    "domain_id_always_hyperlinks": CitationMode.DOMAIN_ID_ALWAYS_HYPERLINKS,
                    "source_tagged_hyperlinks": CitationMode.SOURCE_TAGGED_HYPERLINKS,
                    "no_hyperlinks": CitationMode.NO_HYPERLINKS,
                }
                # Unknown format names fall back to the default mode.
                mode = mode_map.get(
                    citation_format, CitationMode.DOMAIN_ID_HYPERLINKS
                )
                formatter = CitationFormatter(mode=mode)

                report_content = formatter.format_document(report_content)

            if not report_content:
                # If neither field exists, use the full result as JSON
                report_content = json.dumps(serializable_result)

            from ..news.utils.headline_generator import generate_headline
            from ..news.utils.topic_generator import generate_topics

            # Read from the normalised mapping, not the raw ``result``:
            # a non-dict result has no ``.get`` and would abort the whole
            # store with AttributeError.
            query_text = serializable_result.get(
                "query", subscription.get("query", "News Update")
            )

            logger.info(
                f"Generating headline for subscription {subscription_id}"
            )
            generated_headline = generate_headline(
                query=query_text,
                findings=report_content,
                max_length=200,  # Allow longer headlines for news
                settings_snapshot=settings_snapshot,
            )

            logger.info(
                f"Generating topics for subscription {subscription_id}"
            )
            generated_topics = generate_topics(
                query=query_text,
                findings=report_content,
                category=subscription.get("name", "News"),
                max_topics=6,
                settings_snapshot=settings_snapshot,
            )

            logger.info(
                f"Generated headline: {generated_headline}, topics: {generated_topics}"
            )

            subscription_name = subscription.get("name", "")

            # Title preference: generated headline, then subscription
            # name + timestamp, then truncated query + timestamp.
            if generated_headline:
                title = generated_headline
            else:
                if subscription_name:
                    title = f"{subscription_name} - {datetime.now(UTC).isoformat(timespec='minutes')}"
                else:
                    title = f"{query_text[:60]}... - {datetime.now(UTC).isoformat(timespec='minutes')}"

            history_entry = ResearchHistory(
                id=research_id,
                query=serializable_result.get("query", ""),
                mode="news_subscription",
                status="completed",
                created_at=datetime.now(UTC).isoformat(),
                completed_at=datetime.now(UTC).isoformat(),
                title=title,
                research_meta={
                    "subscription_id": subscription_id,
                    "triggered_by": "scheduler",
                    "is_news_search": True,
                    "username": username,
                    "subscription_name": subscription_name,  # Store subscription name for display
                    "settings_snapshot": settings_snapshot,  # Store settings snapshot for later retrieval
                    "generated_headline": generated_headline,  # Store generated headline for news display
                    "generated_topics": generated_topics,  # Store topics for categorization
                },
            )
            db.add(history_entry)
            db.commit()

            # Sources go to structured storage rather than being appended
            # to the report text. A failure here is logged and does not
            # prevent the report itself from being saved.
            if sources:
                try:
                    from ..web.services.research_sources_service import (
                        ResearchSourcesService,
                    )

                    ResearchSourcesService.save_research_sources(
                        research_id=research_id,
                        sources=sources,
                        username=username,
                    )
                except Exception as e:
                    # Redacted message only - ``password`` is in this
                    # frame and must not reach the log.
                    safe_msg = redact_secrets(str(e), password)
                    logger.warning(
                        "Failed to persist scheduler sources for "
                        "research {} — assembler will render no Sources "
                        "block for this row: {}",
                        research_id,
                        safe_msg,
                    )

            # Store the report content using storage abstraction
            from ..storage import get_report_storage

            storage = get_report_storage(session=db)
            storage.save_report(
                research_id=research_id,
                content=report_content,
                username=username,
            )

            logger.info(
                f"Stored research result {research_id} for subscription {subscription_id}"
            )

    except Exception as e:
        # ``password`` is a parameter of this method; log only the
        # redacted message so it cannot leak.
        safe_msg = redact_secrets(str(e), password)
        logger.warning(f"Error storing research result: {safe_msg}")

2325

def _run_cleanup_with_tracking(self):
    """Run the inactive-user cleanup and log its outcome.

    Logs the number of removed users on success. Any exception is logged
    with its traceback and not re-raised.
    """
    try:
        removed = self._cleanup_inactive_users()
        logger.info(f"Cleanup successful: removed {removed} inactive users")
    except Exception:
        logger.exception("Cleanup job failed")

2338

def _cleanup_inactive_users(self) -> int:
    """Drop every user whose last activity is older than the retention window.

    The window is ``config["retention_hours"]`` (48 when unset). For each
    stale user the tracked jobs are removed from the scheduler, stored
    credentials are cleared and the session entry is deleted.

    Returns:
        The number of users removed.
    """
    window = timedelta(hours=self.config.get("retention_hours", 48))
    cutoff = datetime.now(UTC) - window

    with self.lock:
        # Collect first: entries are deleted from the mapping below.
        stale_users = [
            name
            for name, state in self.user_sessions.items()
            if state["last_activity"] < cutoff
        ]

        for name in stale_users:
            # Iterate a snapshot of the tracked job ids.
            for job_id in list(self.user_sessions[name]["scheduled_jobs"]):
                try:
                    self.scheduler.remove_job(job_id)
                except JobLookupError:
                    # Job already gone from the scheduler.
                    pass

            self._credential_store.clear(name)
            del self.user_sessions[name]
            logger.info(f"Cleaned up inactive user {name}")

    return len(stale_users)

2370

def _reload_config(self):
    """Re-read scheduler settings and act on a changed retention period.

    No-op when the instance has no (truthy) ``settings_manager``. Every
    key in ``self.config`` except ``enabled`` is refreshed from
    ``news.scheduler.<key>`` via ``self._get_setting``, keeping the current
    value as the default. When ``retention_hours`` changed, a one-shot
    cleanup is queued to run five seconds from now. Finally the settings
    cache is invalidated. Exceptions are logged with traceback and not
    re-raised.
    """
    if not getattr(self, "settings_manager", None):
        return

    try:
        old_retention = self.config.get("retention_hours", 48)

        # Only existing keys are reassigned, so the dict does not change
        # size while it is being iterated.
        for key in self.config:
            if key == "enabled":
                continue  # Don't change enabled state while running

            full_key = f"news.scheduler.{key}"
            self.config[key] = self._get_setting(full_key, self.config[key])

        if old_retention != self.config["retention_hours"]:
            logger.info(
                f"Retention period changed from {old_retention} "
                f"to {self.config['retention_hours']} hours"
            )
            # Trigger immediate cleanup with new retention.
            # replace_existing=True: the id is fixed, so a second change
            # arriving before the first one-shot job has fired would
            # otherwise raise a conflicting-id error and skip the cache
            # invalidation below.
            self.scheduler.add_job(
                self._wrap_job(self._run_cleanup_with_tracking),
                "date",
                run_date=datetime.now(UTC) + timedelta(seconds=5),
                id="immediate_cleanup_config_change",
                replace_existing=True,
            )

        # Clear settings cache to pick up any user setting changes
        self.invalidate_all_settings_cache()

    except Exception:
        logger.exception("Error reloading configuration")

2406

def get_status(self) -> Dict[str, Any]:
    """Return a snapshot of the scheduler's state.

    Keys: ``is_running``, ``config``, ``active_users``,
    ``total_scheduled_jobs``, ``next_cleanup`` (ISO string, or ``None``
    when the scheduler is not running or the cleanup job is missing) and
    ``memory_usage``.
    """
    with self.lock:
        sessions = self.user_sessions
        active_users = len(sessions)
        total_jobs = sum(
            len(state["scheduled_jobs"]) for state in sessions.values()
        )

    # Next run time of the periodic cleanup job, when there is one.
    cleanup_job = (
        self.scheduler.get_job("cleanup_inactive_users")
        if self.is_running
        else None
    )
    next_cleanup = cleanup_job.next_run_time if cleanup_job else None

    status = {
        "is_running": self.is_running,
        "config": self.config,
        "active_users": active_users,
        "total_scheduled_jobs": total_jobs,
        "next_cleanup": None,
        "memory_usage": self._estimate_memory_usage(),
    }
    if next_cleanup:
        status["next_cleanup"] = next_cleanup.isoformat()
    return status

2431

def _estimate_memory_usage(self) -> int:
    """Return a rough byte estimate for the tracked user sessions.

    Uses a flat per-user figure of 350 (username 50 + password 100 +
    metadata 200) multiplied by the number of sessions.
    """
    bytes_per_user = 50 + 100 + 200
    return bytes_per_user * len(self.user_sessions)

2438

2439 def get_user_sessions_summary(self) -> List[Dict[str, Any]]:

2440 """Get summary of active user sessions (without passwords)."""

2441 with self.lock:

2442 summary = []

2443 for user_id, session in self.user_sessions.items():

2444 summary.append(

2445 {

2446 "user_id": user_id,

2447 "last_activity": session["last_activity"].isoformat(),

2448 "scheduled_jobs": len(session["scheduled_jobs"]),

2449 "time_since_activity": str(

2450 datetime.now(UTC) - session["last_activity"]

2451 ),

2452 }

2453 )

2454 return summary

2455

2456

# Module-level holder for the shared scheduler; filled on first request.
_scheduler_instance = None


def get_background_job_scheduler() -> BackgroundJobScheduler:
    """Return the shared scheduler instance, constructing it on first use."""
    global _scheduler_instance
    instance = _scheduler_instance
    if instance is None:
        instance = _scheduler_instance = BackgroundJobScheduler()
    return instance

2466 return _scheduler_instance