Coverage for src/local_deep_research/web/routes/research

1import io

2import json

3from datetime import datetime, UTC

4from pathlib import Path

6from flask import (

7 Blueprint,

8 g,

9 jsonify,

10 redirect,

11 request,

12 send_file,

13 session,

14 url_for,

15)

16from loguru import logger

17from ...settings.logger import log_settings

18from sqlalchemy import func

20# Security imports

21from ...constants import ResearchStatus

22from ...security import (

23 FileUploadValidator,

24 filter_research_metadata,

25 strip_settings_snapshot,

26 upload_rate_limit,

27)

28from ...config.paths import get_config_directory

30# Services imports

31from ..services.pdf_extraction_service import get_pdf_extraction_service

33from ...database.models import (

34 QueuedResearch,

35 ResearchHistory,

36 ResearchLog,

37 UserActiveResearch,

38)

39from ...database.models.library import Document as Document

40from ...database.session_context import get_user_db_session

41from ..auth.decorators import login_required

42from ..models.database import calculate_duration

43from ..services.research_service import (

44 export_report_to_memory,

45 run_research_process,

46 start_research_process,

47)

48from ..utils.rate_limiter import limiter

49from ..utils.templates import render_template_with_defaults

50from .globals import active_research, termination_flags

# Flask blueprint under which every page and API route in this module is
# registered.
research_bp = Blueprint(name="research", import_name=__name__)

56# Add static route at the root level

@research_bp.route("/redirect-static/<path:path>")
def redirect_static(path):
    """Forward a legacy static-asset URL to the ``static`` endpoint.

    Args:
        path: Asset path captured from the URL; passed through unchanged as
            the ``filename`` of the ``static`` endpoint.

    Returns:
        The redirect response produced by Flask's ``redirect``.
    """
    target_url = url_for("static", filename=path)
    return redirect(target_url)

@research_bp.route("/progress/<string:research_id>")
@login_required
def progress_page(research_id):
    """Serve the research progress page.

    ``research_id`` is captured by the route but is not used when rendering
    the template in this function.
    """
    template_name = "pages/progress.html"
    return render_template_with_defaults(template_name)

@research_bp.route("/details/<string:research_id>")
@login_required
def research_details_page(research_id):
    """Serve the research details page.

    ``research_id`` is captured by the route but is not used when rendering
    the template in this function.
    """
    template_name = "pages/details.html"
    return render_template_with_defaults(template_name)

@research_bp.route("/results/<string:research_id>")
@login_required
def results_page(research_id):
    """Serve the research results page.

    ``research_id`` is captured by the route but is not used when rendering
    the template in this function.
    """
    template_name = "pages/results.html"
    return render_template_with_defaults(template_name)

@research_bp.route("/history")
@login_required
def history_page():
    """Serve the research history page."""
    template_name = "pages/history.html"
    return render_template_with_defaults(template_name)

91# Add missing settings routes

@research_bp.route("/settings", methods=["GET"])
@login_required
def settings_page():
    """Serve the settings dashboard page."""
    template_name = "settings_dashboard.html"
    return render_template_with_defaults(template_name)

@research_bp.route("/settings/main", methods=["GET"])
@login_required
def main_config_page():
    """Serve the main configuration settings page."""
    template_name = "main_config.html"
    return render_template_with_defaults(template_name)

104

105

@research_bp.route("/settings/collections", methods=["GET"])
@login_required
def collections_config_page():
    """Serve the collections configuration settings page."""
    template_name = "collections_config.html"
    return render_template_with_defaults(template_name)

111

112

@research_bp.route("/settings/api_keys", methods=["GET"])
@login_required
def api_keys_config_page():
    """Serve the API keys configuration settings page."""
    template_name = "api_keys_config.html"
    return render_template_with_defaults(template_name)

118

119

@research_bp.route("/settings/search_engines", methods=["GET"])
@login_required
def search_engines_config_page():
    """Serve the search engines configuration settings page."""
    template_name = "search_engines_config.html"
    return render_template_with_defaults(template_name)

125

126

@research_bp.route("/settings/llm", methods=["GET"])
@login_required
def llm_config_page():
    """Serve the LLM configuration settings page."""
    template_name = "llm_config.html"
    return render_template_with_defaults(template_name)

132

133

def _get_max_concurrent_researches(username):
    """Read the ``app.max_concurrent_researches`` setting (default 3)."""
    from ...settings import SettingsManager

    with get_user_db_session(username) as db_session:
        return SettingsManager(db_session).get_setting(
            "app.max_concurrent_researches", 3
        )


def _capture_settings_snapshot(username, research_id):
    """Return all current settings, to be stored with the research.

    Uses the request-scoped session on ``g`` when one exists; otherwise opens
    a temporary session with the user's password.

    Raises:
        RuntimeError: If no session can be obtained for the snapshot.
    """
    from ...settings import SettingsManager

    request_db_session = getattr(g, "db_session", None)
    if request_db_session:
        # Flush pending changes so the snapshot reflects committed state.
        try:
            request_db_session.commit()
        except Exception:
            request_db_session.rollback()

        # Bypass the cache to ensure fresh data.
        all_settings = SettingsManager(request_db_session).get_all_settings(
            bypass_cache=True
        )
        logger.info(
            f"Captured {len(all_settings)} settings for research {research_id}"
        )
        return all_settings

    logger.warning(
        "No database session in g, creating temporary session for settings snapshot"
    )
    from ...database.thread_local_session import get_metrics_session

    password = getattr(g, "user_password", None)
    if not password:
        from ...database.session_passwords import session_password_store

        session_id = session.get("session_id")
        if session_id:
            password = session_password_store.get_session_password(
                username, session_id
            )

    if not password:
        logger.error(
            "No password available to create session for settings snapshot"
        )
        raise RuntimeError("Cannot create research without settings snapshot")

    temp_session = get_metrics_session(username, password)
    if not temp_session:
        logger.error("Failed to create temporary session for settings snapshot")
        raise RuntimeError("Cannot create research without settings snapshot")

    all_settings = SettingsManager(temp_session).get_all_settings(
        bypass_cache=True
    )
    logger.info(
        f"Captured {len(all_settings)} settings using temporary session for research {research_id}"
    )
    return all_settings


def _enqueue_research_and_notify(
    db_session,
    username,
    research_id,
    query,
    mode,
    research_settings,
    run_params,
    message_prefix,
    research=None,
):
    """Queue a research for the user, notify the processor, build the response.

    Args:
        db_session: Session used to add the ``QueuedResearch`` row.
        username: Owner of the research.
        research_id: Identifier of the research being queued.
        query: Research query text.
        mode: Research mode.
        research_settings: Metadata stored as the queue row's
            ``settings_snapshot`` and forwarded to the queue processor.
        run_params: Keyword arguments (provider, model, ...) forwarded to
            ``queue_processor.notify_research_queued``.
        message_prefix: Sentence placed before the queue position in the
            response message.
        research: Optional ``ResearchHistory`` row whose status is switched
            to ``QUEUED`` in the same commit.

    Returns:
        The JSON response describing the queued research.
    """
    # Next position is one past the user's current highest (0 when empty).
    position = (
        db_session.query(func.max(QueuedResearch.position))
        .filter_by(username=username)
        .scalar()
        or 0
    ) + 1

    db_session.add(
        QueuedResearch(
            username=username,
            research_id=research_id,
            query=query,
            mode=mode,
            settings_snapshot=research_settings,
            position=position,
        )
    )
    if research is not None:
        research.status = ResearchStatus.QUEUED
    db_session.commit()
    logger.info(
        f"Queued research {research_id} at position {position} for user {username}"
    )

    from ..queue.processor_v2 import queue_processor

    # Pass everything needed for potential direct execution; the session ID
    # is what gives the processor access to the user's password.
    queue_processor.notify_research_queued(
        username,
        research_id,
        session_id=session.get("session_id"),
        query=query,
        mode=mode,
        settings_snapshot=research_settings,
        **run_params,
    )

    return jsonify(
        {
            "status": ResearchStatus.QUEUED,
            "research_id": research_id,
            "queue_position": position,
            "message": f"{message_prefix} Position in queue: {position}",
        }
    )


def _get_user_password_for_thread(username):
    """Find the user's password for database access in the research thread.

    Tries, in order: the session password store, ``g.user_password``, and the
    temporary auth store (peeked, so the token is not consumed).

    Returns:
        The password, or ``None`` when no source has one.
    """
    from ...database.session_passwords import session_password_store

    user_password = None
    session_id = session.get("session_id")
    if session_id:
        user_password = session_password_store.get_session_password(
            username, session_id
        )

    if not user_password:
        user_password = getattr(g, "user_password", None)

    if not user_password:
        from ...database.temp_auth import temp_auth_store

        auth_token = session.get("temp_auth_token")
        if auth_token:
            auth_data = temp_auth_store.peek_auth(auth_token)
            if auth_data and auth_data[0] == username:
                user_password = auth_data[1]

    return user_password


@research_bp.route("/api/start_research", methods=["POST"])
@login_required
def start_research():
    """Create a research record and either start it or queue it.

    Expects a JSON object body. ``query`` is required. ``mode`` defaults to
    ``"quick"``. ``model_provider``, ``model``, ``custom_endpoint``,
    ``ollama_url``, ``search_engine``/``search_tool``, ``iterations``,
    ``questions_per_iteration`` and ``strategy`` fall back to stored settings
    when absent. ``max_results``, ``time_period`` and ``metadata`` are
    optional. A ``YYYY-MM-DD`` placeholder in the query is replaced with the
    current UTC date.

    Returns:
        * 200 ``{"status": "success", "research_id": ...}`` when started.
        * 200 with status ``ResearchStatus.QUEUED`` and ``queue_position``
          when the user is at the concurrent-research limit.
        * 400 for a missing/invalid body, query, model or custom endpoint.
        * 401 when no username is in the session.
        * 500 when the settings snapshot or the database record fails.
    """
    import threading
    import uuid

    from ...settings.manager import SettingsManager

    data = request.json
    # A body of `null`, a list, or a scalar would otherwise crash on `.get`.
    if not isinstance(data, dict):
        logger.warning("start_research called without a JSON object body")
        return jsonify(
            {"status": "error", "message": "A JSON object body is required"}
        ), 400

    # Debug logging to trace model parameter
    logger.debug(f"Request data keys: {list(data.keys())}")

    metadata = data.get("metadata")
    if metadata is None:
        metadata = {}
    elif not isinstance(metadata, dict):
        return jsonify(
            {"status": "error", "message": "metadata must be a JSON object"}
        ), 400

    if metadata.get("is_news_search"):
        logger.info(
            f"News search request received: triggered_by={metadata.get('triggered_by', 'unknown')}"
        )

    query = data.get("query")
    mode = data.get("mode", "quick")

    # Replace date placeholders if they exist (current date in UTC).
    if isinstance(query, str) and "YYYY-MM-DD" in query:
        current_date = datetime.now(UTC).strftime("%Y-%m-%d")

        original_query = query
        query = query.replace("YYYY-MM-DD", current_date)
        logger.info(
            f"Replaced date placeholder in query: {original_query[:100]}... -> {query[:100]}..."
        )
        logger.info(f"Using date: {current_date}")

        # Track the replacement in the metadata stored with the research.
        metadata["original_query"] = original_query
        metadata["processed_query"] = query
        metadata["date_replaced"] = current_date

    username = session.get("username")
    if not username:
        return jsonify({"error": "Not authenticated"}), 401

    # Resolve parameters: request values win, stored settings are fallback.
    with get_user_db_session(username) as db_session:
        settings_manager = SettingsManager(db_session=db_session)

        model_provider = data.get("model_provider")
        if not model_provider:
            model_provider = settings_manager.get_setting(
                "llm.provider", "OLLAMA"
            )
            logger.debug(
                f"No model_provider in request, using database setting: {model_provider}"
            )
        else:
            logger.debug(f"Using model_provider from request: {model_provider}")

        model = data.get("model")
        if not model:
            model = settings_manager.get_setting("llm.model", None)
            logger.debug(
                f"No model in request, using database setting: {model}"
            )
        else:
            logger.debug(f"Using model from request: {model}")

        custom_endpoint = data.get("custom_endpoint")
        if not custom_endpoint and model_provider == "OPENAI_ENDPOINT":
            custom_endpoint = settings_manager.get_setting(
                "llm.openai_endpoint.url", None
            )
            logger.debug(
                f"No custom_endpoint in request, using database setting: {custom_endpoint}"
            )

        # NOTE(review): ollama_url is resolved and logged but not passed on
        # by this function; kept for parity with the existing behaviour.
        ollama_url = data.get("ollama_url")
        if not ollama_url and model_provider == "OLLAMA":
            ollama_url = settings_manager.get_setting(
                "llm.ollama.url", "http://localhost:11434"
            )
            logger.debug(
                f"No ollama_url in request, using database setting: {ollama_url}"
            )

        search_engine = data.get("search_engine") or data.get("search_tool")
        if not search_engine:
            search_engine = settings_manager.get_setting(
                "search.tool", "searxng"
            )

        max_results = data.get("max_results")
        time_period = data.get("time_period")

        iterations = data.get("iterations")
        if iterations is None:
            iterations = settings_manager.get_setting("search.iterations", 5)

        questions_per_iteration = data.get("questions_per_iteration")
        if questions_per_iteration is None:
            questions_per_iteration = settings_manager.get_setting(
                "search.questions_per_iteration", 5
            )

        strategy = data.get("strategy")
        if not strategy:
            strategy = settings_manager.get_setting(
                "search.search_strategy", "source-based"
            )

    # Debug logging for model parameter specifically
    logger.debug(
        f"Extracted model value: '{model}' (type: {type(model).__name__})"
    )

    # Log the selections for troubleshooting
    logger.info(
        f"Starting research with provider: {model_provider}, model: {model}, search engine: {search_engine}"
    )
    logger.info(
        f"Additional parameters: max_results={max_results}, time_period={time_period}, iterations={iterations}, questions={questions_per_iteration}, strategy={strategy}"
    )

    if not query or not isinstance(query, str):
        return jsonify({"status": "error", "message": "Query is required"}), 400

    # Validate required parameters based on provider
    if model_provider == "OPENAI_ENDPOINT" and not custom_endpoint:
        return (
            jsonify(
                {
                    "status": "error",
                    "message": "Custom endpoint URL is required for OpenAI endpoint provider",
                }
            ),
            400,
        )

    if not model:
        logger.error(
            f"No model specified or configured. Provider: {model_provider}"
        )
        return jsonify(
            {
                "status": "error",
                "message": "Model is required. Please configure a model in the settings.",
            }
        ), 400

    # Parameters shared by the queue notification and direct thread start.
    run_params = {
        "model_provider": model_provider,
        "model": model,
        "custom_endpoint": custom_endpoint,
        "search_engine": search_engine,
        "max_results": max_results,
        "time_period": time_period,
        "iterations": iterations,
        "questions_per_iteration": questions_per_iteration,
        "strategy": strategy,
    }

    max_concurrent_researches = _get_max_concurrent_researches(username)

    # Queue when the user already has the maximum number of active runs.
    # If the count cannot be determined, default to not queueing.
    should_queue = False
    try:
        request_db_session = getattr(g, "db_session", None)
        if request_db_session:
            active_count = (
                request_db_session.query(UserActiveResearch)
                .filter_by(username=username, status=ResearchStatus.IN_PROGRESS)
                .count()
            )
            logger.info(
                f"Active research count for {username}: {active_count}/{max_concurrent_researches}"
            )
            should_queue = active_count >= max_concurrent_researches
            logger.info(f"Should queue new research: {should_queue}")
        else:
            logger.warning(
                "No database session available to check active researches"
            )
    except Exception:
        logger.exception("Failed to check active researches")
        should_queue = False

    # Explicit UTC timestamp for the new record.
    created_at = datetime.now(UTC).isoformat()
    research_id = str(uuid.uuid4())

    # Organized research metadata; the settings snapshot is added below.
    research_settings = {
        # Direct submission parameters
        "submission": dict(run_params),
        # System information
        "system": {
            "timestamp": created_at,
            "user": username,
            "version": "1.0",  # Track metadata version for future migrations
            "server_url": request.host_url,  # Server URL for link generation
        },
    }

    # NOTE(review): request metadata is merged at the top level, so a client
    # can overwrite the "submission" and "system" keys - confirm intended.
    if metadata:
        research_settings.update(metadata)

    # Thread-based research cannot run without a settings snapshot.
    try:
        research_settings["settings_snapshot"] = _capture_settings_snapshot(
            username, research_id
        )
    except Exception:
        logger.exception("Failed to capture settings snapshot")
        return jsonify(
            {
                "status": "error",
                "message": "Failed to capture settings for research. Please try again.",
            }
        ), 500

    try:
        db_session = getattr(g, "db_session", None)
        if db_session:
            initial_status = (
                ResearchStatus.QUEUED
                if should_queue
                else ResearchStatus.IN_PROGRESS
            )

            research = ResearchHistory(
                id=research_id,  # Set UUID as primary key
                query=query,
                mode=mode,
                status=initial_status,
                created_at=created_at,
                progress_log=[{"time": created_at, "progress": 0}],
                research_meta=research_settings,
            )
            db_session.add(research)
            db_session.commit()
            logger.info(
                f"Created research entry with UUID: {research_id}, status: {initial_status}"
            )

            if should_queue:
                return _enqueue_research_and_notify(
                    db_session,
                    username,
                    research_id,
                    query,
                    mode,
                    research_settings,
                    run_params,
                    "Your research has been queued.",
                )

            # Start immediately: create the active-research tracking record.
            # thread_id holds the request thread until the worker starts.
            active_record = UserActiveResearch(
                username=username,
                research_id=research_id,
                status=ResearchStatus.IN_PROGRESS,
                thread_id=str(threading.current_thread().ident),
                settings_snapshot=research_settings,
            )
            db_session.add(active_record)
            db_session.commit()
            logger.info(f"Created active research record for user {username}")

            # Re-count after committing to catch concurrent submissions.
            # Best effort: a failure here is logged and the research starts.
            try:
                final_count = (
                    db_session.query(UserActiveResearch)
                    .filter_by(
                        username=username, status=ResearchStatus.IN_PROGRESS
                    )
                    .count()
                )
                logger.info(
                    f"Final active count after commit: {final_count}/{max_concurrent_researches}"
                )

                if final_count > max_concurrent_researches:
                    # Limit exceeded by a race: drop the active record and
                    # queue this research instead.
                    logger.warning(
                        f"Race condition detected: {final_count} > {max_concurrent_researches}, moving to queue"
                    )
                    db_session.delete(active_record)
                    db_session.commit()

                    return _enqueue_research_and_notify(
                        db_session,
                        username,
                        research_id,
                        query,
                        mode,
                        research_settings,
                        run_params,
                        "Your research has been queued due to concurrent limit.",
                        research=research,
                    )
            except Exception as e:
                logger.warning(f"Could not recheck active count: {e}")

    except Exception:
        logger.exception("Failed to create research entry")
        return jsonify(
            {"status": "error", "message": "Failed to create research entry"}
        ), 500

    # Only start the research if not queued
    if not should_queue:
        # Save the strategy before starting the thread (best effort).
        try:
            from ..services.research_service import save_research_strategy

            save_research_strategy(research_id, strategy, username=username)
        except Exception as e:
            logger.warning(f"Could not save research strategy: {e}")

        # Debug logging for settings snapshot
        snapshot_data = research_settings.get("settings_snapshot", {})
        log_settings(snapshot_data, "Settings snapshot being passed to thread")
        if "search.tool" in snapshot_data:
            logger.debug(
                f"search.tool in snapshot: {snapshot_data['search.tool']}"
            )
        else:
            logger.debug("search.tool NOT in snapshot")

        # Password for metrics access in the background thread.
        user_password = _get_user_password_for_thread(username)
        if not user_password:
            logger.warning(
                f"No password available for metrics access for user {username}"
            )

        research_thread = start_research_process(
            research_id,
            query,
            mode,
            active_research,
            termination_flags,
            run_research_process,
            username=username,  # Pass username to the thread
            user_password=user_password,  # Pass password for database access
            settings_snapshot=snapshot_data,  # Pass complete settings
            **run_params,
        )

        # Update the active research record with the actual thread ID
        try:
            with get_user_db_session(username) as thread_session:
                active_record = (
                    thread_session.query(UserActiveResearch)
                    .filter_by(username=username, research_id=research_id)
                    .first()
                )
                if active_record:
                    active_record.thread_id = str(research_thread.ident)
                    thread_session.commit()
        except Exception as e:
            logger.warning(f"Could not update thread ID: {e}")

    return jsonify({"status": "success", "research_id": research_id})

721

722

@research_bp.route("/api/terminate/<string:research_id>", methods=["POST"])
@login_required
def terminate_research(research_id):
    """Terminate an in-progress research process.

    Args:
        research_id: Identifier of the research to terminate.

    Returns:
        * 200 ``{"status": "success", ...}`` when the research was already in
          a terminal state, was marked suspended, or termination was
          requested for a running research.
        * 401 when no username is in the session.
        * 404 when the research does not exist.
        * 500 on any unexpected error.
    """
    username = session.get("username")
    if not username:
        return jsonify({"error": "Not authenticated"}), 401

    try:
        with get_user_db_session(username) as db_session:
            research = (
                db_session.query(ResearchHistory)
                .filter_by(id=research_id)
                .first()
            )

            if not research:
                return jsonify(
                    {"status": "error", "message": "Research not found"}
                ), 404

            status = research.status

            # If it's already in a terminal state, return success
            if status in (
                ResearchStatus.COMPLETED,
                ResearchStatus.SUSPENDED,
                ResearchStatus.FAILED,
                ResearchStatus.ERROR,
            ):
                return jsonify(
                    {
                        "status": "success",
                        "message": f"Research already {status}",
                    }
                )

            # Single lookup rather than check-then-index: the worker may
            # remove the entry at any moment.
            try:
                active_entry = active_research[research_id]
            except KeyError:
                active_entry = None

            if active_entry is None:
                # Not running in this process: just record the new status.
                research.status = ResearchStatus.SUSPENDED
                db_session.commit()
                return jsonify(
                    {"status": "success", "message": "Research terminated"}
                )

            # Set the termination flag
            termination_flags[research_id] = True

            # Log the termination request - using UTC timestamp
            termination_message = "Research termination requested by user"
            log_entry = {
                "time": datetime.now(UTC).isoformat(),
                "message": termination_message,
                "progress": active_entry["progress"],
                "metadata": {"phase": "termination"},
            }

            # Add to in-memory log
            active_entry["log"].append(log_entry)

            logger.log("MILESTONE", f"Research ended: {termination_message}")

            # The stored log may be a JSON string or an already-decoded list.
            stored_log = research.progress_log
            if isinstance(stored_log, str):
                try:
                    stored_log = json.loads(stored_log)
                except ValueError:
                    stored_log = []

            # Assign a NEW list: mutating the loaded list in place and
            # reassigning the same object may not be seen as a change by the
            # ORM, so the appended entry might never be written.
            current_log = list(stored_log) if isinstance(stored_log, list) else []
            current_log.append(log_entry)
            research.progress_log = current_log
            research.status = ResearchStatus.SUSPENDED
            db_session.commit()

            # Emit a socket event for the termination request (non-critical).
            try:
                event_data = {
                    "status": ResearchStatus.SUSPENDED,
                    "message": "Research was suspended by user request",
                }

                from ..services.socket_service import SocketIOService

                SocketIOService().emit_socket_event(
                    f"research_progress_{research_id}", event_data
                )

            except Exception:
                logger.exception("Socket emit error (non-critical)")

            return jsonify(
                {
                    "status": "success",
                    "message": "Research termination requested",
                }
            )
    except Exception:
        logger.exception("Error terminating research")
        return jsonify(
            {"status": "error", "message": "Failed to terminate research"}
        ), 500

836

837

@research_bp.route("/api/delete/<string:research_id>", methods=["DELETE"])
@login_required
def delete_research(research_id):
    """Remove one research record and its report file, if any.

    A research that is ``IN_PROGRESS`` and still tracked in
    ``active_research`` cannot be deleted (400). Returns 401 without a
    session username, 404 when the record is missing, 500 on unexpected
    errors and ``{"status": "success"}`` otherwise.
    """
    current_user = session.get("username")
    if not current_user:
        return jsonify({"error": "Not authenticated"}), 401

    try:
        with get_user_db_session(current_user) as db_session:
            lookup = db_session.query(ResearchHistory).filter_by(id=research_id)
            record = lookup.first()

            if not record:
                not_found = {"status": "error", "message": "Research not found"}
                return jsonify(not_found), 404

            record_status = record.status
            stored_report_path = record.report_path

            # Refuse to delete a research that is still running.
            still_running = (
                record_status == ResearchStatus.IN_PROGRESS
                and research_id in active_research
            )
            if still_running:
                refusal = {
                    "status": "error",
                    "message": "Cannot delete research that is in progress",
                }
                return jsonify(refusal), 400

            # Remove the report from disk; a failure is logged, not fatal.
            if stored_report_path:
                report_file = Path(stored_report_path)
                if report_file.exists():
                    try:
                        report_file.unlink()
                    except Exception:
                        logger.exception("Error removing report file")

            # Finally drop the database row itself.
            db_session.delete(record)
            db_session.commit()

            return jsonify({"status": "success"})
    except Exception:
        logger.exception("Error deleting research")
        failure = {"status": "error", "message": "Failed to delete research"}
        return jsonify(failure), 500

894

895

@research_bp.route("/api/clear_history", methods=["POST"])
@login_required
def clear_history():
    """Clear all research history except researches that are still active.

    Report files of the removed researches are deleted on a best-effort
    basis (failures are logged), then the records are deleted.

    Returns:
        * 200 ``{"status": "success"}`` on success.
        * 401 when no username is in the session.
        * 500 on any unexpected error.
    """
    username = session.get("username")
    if not username:
        return jsonify({"error": "Not authenticated"}), 401

    try:
        with get_user_db_session(username) as db_session:
            # Snapshot the active ids once, so file cleanup and record
            # deletion act on the same set even if a research finishes
            # while this request is running.
            active_ids = list(active_research.keys())
            active_id_set = set(active_ids)

            # Get all research records first to clean up files
            research_records = db_session.query(ResearchHistory).all()

            for research in research_records:
                # Skip active research
                if research.id in active_id_set:
                    continue

                # Delete report file if it exists
                if research.report_path and Path(research.report_path).exists():
                    try:
                        Path(research.report_path).unlink()
                    except Exception:
                        logger.exception("Error removing report file")

            # Delete records from the database, except active research
            if active_ids:
                db_session.query(ResearchHistory).filter(
                    ~ResearchHistory.id.in_(active_ids)
                ).delete(synchronize_session=False)
            else:
                db_session.query(ResearchHistory).delete(
                    synchronize_session=False
                )

            db_session.commit()

            return jsonify({"status": "success"})
    except Exception:
        logger.exception("Error clearing history")
        return jsonify(
            {"status": "error", "message": "Failed to process request"}
        ), 500

940

941

@research_bp.route("/open_file_location", methods=["POST"])
@login_required
def open_file_location():
    """Refuse requests to open a path in the system file explorer.

    Security: the feature is disabled for server deployments; it is only
    meaningful for desktop usage where server and client share a machine.
    The handler therefore always answers 403 with an explanatory payload.
    """
    disabled_payload = {
        "status": "error",
        "message": "This feature is disabled. It is only available in desktop mode.",
    }
    return jsonify(disabled_payload), 403

956

957

@research_bp.route("/api/save_raw_config", methods=["POST"])
@login_required
def save_raw_config():
    """Validate a raw TOML configuration and write it to ``config.toml``.

    The request body must be a JSON object with a ``raw_config`` entry. The
    text is parsed as TOML and scanned recursively for keys matching the
    blocked patterns (used for dynamic module imports) before it is written
    through ``write_file_verified``.

    Returns:
        ``{"success": True}`` on success; 400 when ``raw_config`` is missing
        or is not valid TOML; 403 when a blocked key is present; 500 when
        writing the file fails.
    """
    # silent=True yields None instead of raising on a missing/invalid JSON
    # body, and the isinstance check covers valid JSON that is not an object
    # (e.g. ``null`` or a list), which previously crashed on ``.get``.
    data = request.get_json(silent=True)
    raw_config = data.get("raw_config") if isinstance(data, dict) else None

    if not raw_config:
        return (
            jsonify(
                {"success": False, "error": "Raw configuration is required"}
            ),
            400,
        )

    # Security: Parse and validate the TOML to block dangerous keys
    try:
        import tomllib
    except ImportError:
        import tomli as tomllib

    try:
        parsed_config = tomllib.loads(raw_config)
    except Exception as e:
        logger.warning(f"Invalid TOML configuration: {e}")
        # Don't expose internal exception details to users (CWE-209)
        return jsonify(
            {
                "success": False,
                "error": "Invalid TOML syntax. Please check your configuration format.",
            }
        ), 400

    # Security: Check for dangerous keys that could enable code execution
    # These patterns match keys used for dynamic module imports
    BLOCKED_KEY_PATTERNS = ("module_path", "class_name", "module", "class")

    def find_blocked_keys(obj, path=""):
        """Recursively collect dotted paths of keys matching a blocked pattern."""
        blocked = []
        if isinstance(obj, dict):
            for key, value in obj.items():
                current_path = f"{path}.{key}" if path else key
                key_lower = key.lower()
                # Substring match: any key containing a pattern is rejected.
                if any(pattern in key_lower for pattern in BLOCKED_KEY_PATTERNS):
                    blocked.append(current_path)
                # Recurse into nested tables/arrays
                blocked.extend(find_blocked_keys(value, current_path))
        elif isinstance(obj, list):
            for i, item in enumerate(obj):
                blocked.extend(find_blocked_keys(item, f"{path}[{i}]"))
        return blocked

    blocked_keys = find_blocked_keys(parsed_config)
    if blocked_keys:
        logger.warning(
            f"Security: Blocked attempt to write config with dangerous keys: {blocked_keys}"
        )
        return jsonify(
            {
                "success": False,
                "error": "Configuration contains protected keys that cannot be modified",
                "blocked_keys": blocked_keys,
            }
        ), 403

    try:
        from ...security.file_write_verifier import write_file_verified

        # Get the config file path (uses centralized path config, respects LDR_DATA_DIR)
        config_dir = get_config_directory()
        config_path = config_dir / "config.toml"

        # Write the configuration to file
        write_file_verified(
            config_path,
            raw_config,
            "system.allow_config_write",
            context="system configuration file",
        )

        return jsonify({"success": True})
    except Exception:
        logger.exception("Error saving configuration file")
        return jsonify(
            {"success": False, "error": "Failed to process request"}
        ), 500

1047

1048

@research_bp.route("/api/history", methods=["GET"])
@login_required
def get_history():
    """Return the user's research history, newest first.

    Each item carries the record's id, query, mode, status, timestamps, the
    computed duration (when both timestamps are present), filtered metadata,
    the number of library documents linked to the research, and the title
    when one is set.

    Returns:
        ``{"status": "success", "items": [...]}``; 401 when no username is in
        the session; 500 with a generic error payload on unexpected failure.
    """
    username = session.get("username")
    if not username:
        return jsonify({"error": "Not authenticated"}), 401

    try:
        with get_user_db_session(username) as db_session:
            # Query all research history ordered by created_at
            research_records = (
                db_session.query(ResearchHistory)
                .order_by(ResearchHistory.created_at.desc())
                .all()
            )

            # Count library documents per research in ONE grouped query
            # instead of issuing a separate COUNT for every history row.
            document_counts = {
                research_id: count
                for research_id, count in db_session.query(
                    Document.research_id, func.count()
                ).group_by(Document.research_id)
            }

            # Build history items while session is active to avoid
            # DetachedInstanceError on ORM attribute access
            history_items = []
            for research in research_records:
                # Calculate duration if completed
                duration_seconds = None
                if research.completed_at and research.created_at:
                    try:
                        duration_seconds = calculate_duration(
                            research.created_at, research.completed_at
                        )
                    except Exception:
                        logger.exception("Error calculating duration")

                item = {
                    "id": research.id,
                    "query": research.query,
                    "mode": research.mode,
                    "status": research.status,
                    "created_at": research.created_at,
                    "completed_at": research.completed_at,
                    "duration_seconds": duration_seconds,
                    "metadata": filter_research_metadata(
                        research.research_meta
                    ),
                    # Research without any linked documents has no group row.
                    "document_count": document_counts.get(research.id, 0),
                }

                # Add title if it exists
                title = getattr(research, "title", None)
                if title is not None:
                    item["title"] = title

                history_items.append(item)

        return jsonify({"status": "success", "items": history_items})
    except Exception:
        logger.exception("Error getting history")
        return jsonify(
            {"status": "error", "message": "Failed to process request"}
        ), 500

1114

1115

@research_bp.route("/api/research/<string:research_id>")
@login_required
def get_research_details(research_id):
    """Return the stored details of one research record as JSON.

    Args:
        research_id: Identifier of the research record, taken from the URL.

    Returns:
        The record's fields with its metadata passed through
        ``strip_settings_snapshot``; 401 without a session username; 404
        when no record matches; 500 on unexpected failure.
    """
    username = session.get("username")
    if not username:
        return jsonify({"error": "Not authenticated"}), 401

    try:
        with get_user_db_session(username) as db_session:
            record = (
                db_session.query(ResearchHistory)
                .filter(ResearchHistory.id == research_id)
                .first()
            )

            if not record:
                return jsonify({"error": "Research not found"}), 404

            # Serialize while the session is still open.
            details = {
                "id": record.id,
                "query": record.query,
                "status": record.status,
                "progress": record.progress,
                "progress_percentage": record.progress or 0,
                "mode": record.mode,
                "created_at": record.created_at,
                "completed_at": record.completed_at,
                "report_path": record.report_path,
                "metadata": strip_settings_snapshot(record.research_meta),
            }
            return jsonify(details)
    except Exception:
        logger.exception("Error getting research details")
        return jsonify({"error": "An internal error has occurred"}), 500

1152

1153

@research_bp.route("/api/research/<string:research_id>/logs")
@login_required
def get_research_logs(research_id):
    """Return the log entries of one research, ordered by timestamp.

    Args:
        research_id: Identifier of the research record, taken from the URL.

    Returns:
        A JSON list of ``{id, message, timestamp, log_type}`` objects; 401
        without a session username; 404 when the research does not exist;
        500 on unexpected failure.
    """
    username = session.get("username")
    if not username:
        return jsonify({"error": "Not authenticated"}), 401

    try:
        with get_user_db_session(username) as db_session:
            # The research must exist before its logs are looked up.
            record = (
                db_session.query(ResearchHistory)
                .filter_by(id=research_id)
                .first()
            )
            if not record:
                return jsonify({"error": "Research not found"}), 404

            log_query = (
                db_session.query(ResearchLog)
                .filter_by(research_id=research_id)
                .order_by(ResearchLog.timestamp)
            )

            # Copy the attributes into plain dicts while the session is
            # active to avoid DetachedInstanceError later on.
            logs = [
                {
                    "id": entry.id,
                    "message": entry.message,
                    "timestamp": entry.timestamp,
                    "log_type": entry.level,
                }
                for entry in log_query.all()
            ]

        return jsonify(logs)

    except Exception:
        logger.exception("Error getting research logs")
        return jsonify({"error": "An internal error has occurred"}), 500

1199

1200

@research_bp.route("/api/report/<string:research_id>")
@login_required
def get_research_report(research_id):
    """Return the report content and metadata of one research record.

    The report text is loaded through the storage abstraction. The response
    exposes it as both ``content`` and ``summary`` and lifts ``sources`` and
    ``findings`` out of the metadata for backwards compatibility.

    Args:
        research_id: Identifier of the research record, taken from the URL.

    Returns:
        The JSON report payload; 401 without a session username; 404 when
        either the research record or its report is missing; 500 on
        unexpected failure.
    """
    username = session.get("username")
    if not username:
        return jsonify({"error": "Not authenticated"}), 401

    try:
        with get_user_db_session(username) as db_session:
            # Query using ORM
            research = (
                db_session.query(ResearchHistory)
                .filter_by(id=research_id)
                .first()
            )

            if research is None:
                return jsonify({"error": "Research not found"}), 404

            # research_meta may be NULL; normalize to a dict so the lookups
            # below never depend on how helpers treat None.
            metadata = research.research_meta or {}

            # Get report content using storage abstraction
            from ...storage import get_report_storage

            # Pass settings_snapshot to avoid thread context issues
            storage = get_report_storage(
                session=db_session,
                settings_snapshot=metadata.get("settings_snapshot"),
            )
            content = storage.get_report(research_id, username)

            if content is None:
                return jsonify({"error": "Report not found"}), 404

            # Return the report data with backwards-compatible fields
            # Examples expect 'summary', 'sources', 'findings' at top level
            safe_metadata = strip_settings_snapshot(metadata)
            return jsonify(
                {
                    "content": content,
                    # Backwards-compatible fields for examples
                    "summary": content,  # The markdown report is the summary
                    "sources": safe_metadata.get("all_links_of_system", []),
                    "findings": safe_metadata.get("findings", []),
                    "metadata": {
                        # Falsy column values are reported as None.
                        "title": research.title or None,
                        "query": research.query,
                        "mode": research.mode or None,
                        "created_at": research.created_at or None,
                        "completed_at": research.completed_at or None,
                        "report_path": research.report_path,
                        # Spread last: stored metadata keys take precedence
                        # over the column values above.
                        **safe_metadata,
                    },
                }
            )

    except Exception:
        logger.exception("Error getting research report")
        return jsonify({"error": "An internal error has occurred"}), 500

1270

1271

@research_bp.route(
    "/api/v1/research/<research_id>/export/<format>", methods=["POST"]
)
@login_required
def export_research_report(research_id, format):
    """Export a research report in the requested format as a download.

    The format is validated against ``ExporterRegistry``, the report text is
    loaded through the storage abstraction, converted in memory and streamed
    back as an attachment.

    Args:
        research_id: Identifier of the research record, taken from the URL.
        format: Export format name, taken from the URL.

    Returns:
        The exported file; 400 for an unsupported format; 401 without a
        session username; 404 when the research or its report content is
        missing; 500 when exporting or anything else fails.
    """
    try:
        # Use the exporter registry to validate format
        from ...exporters import ExporterRegistry

        if not ExporterRegistry.is_format_supported(format):
            available = ExporterRegistry.get_available_formats()
            return jsonify(
                {
                    "error": f"Invalid format. Available formats: {', '.join(available)}"
                }
            ), 400

        username = session.get("username")
        if not username:
            return jsonify({"error": "Not authenticated"}), 401

        try:
            with get_user_db_session(username) as db_session:
                record = (
                    db_session.query(ResearchHistory)
                    .filter_by(id=research_id)
                    .first()
                )
                if not record:
                    return jsonify({"error": "Research not found"}), 404

                # Get report using storage abstraction
                from ...storage import get_report_storage

                # The settings snapshot, if any, lives in the metadata.
                meta = record.research_meta or {}
                snapshot = meta.get("settings_snapshot")

                storage = get_report_storage(
                    session=db_session, settings_snapshot=snapshot
                )

                # Load the report text directly (in memory)
                report_content = storage.get_report(research_id, username)
                if not report_content:
                    return jsonify({"error": "Report content not found"}), 404

                # Convert and send, all in memory
                try:
                    # Prefer the title, fall back to the query
                    document_title = record.title or record.query

                    payload, filename, mimetype = export_report_to_memory(
                        report_content, format, title=document_title
                    )

                    return send_file(
                        io.BytesIO(payload),
                        as_attachment=True,
                        download_name=filename,
                        mimetype=mimetype,
                    )
                except Exception:
                    logger.exception("Error exporting report")
                    return jsonify(
                        {
                            "error": f"Failed to export to {format}. Please try again later."
                        }
                    ), 500

        except Exception:
            logger.exception("Error in export endpoint")
            return jsonify({"error": "An internal error has occurred"}), 500

    except Exception:
        logger.exception("Unexpected error in export endpoint")
        return jsonify({"error": "An internal error has occurred"}), 500

1359

1360

@research_bp.route("/api/research/<string:research_id>/status")
@limiter.exempt
@login_required
def get_research_status(research_id):
    """Return status, progress and error details of a research process.

    When the stored metadata contains an ``error`` entry, it is classified
    by keyword into a UI-friendly ``error_info`` object (type, message,
    suggestion) that is added to the returned metadata. The most recent
    ``MILESTONE`` log entry, if any, is included as ``log_entry``.

    Args:
        research_id: Identifier of the research record, taken from the URL.

    Returns:
        The JSON status payload; 401 without a session username; 404 when
        the research does not exist; 500 on unexpected failure.
    """
    username = session.get("username")
    if not username:
        return jsonify({"error": "Not authenticated"}), 401

    try:
        with get_user_db_session(username) as db_session:
            research = (
                db_session.query(ResearchHistory)
                .filter_by(id=research_id)
                .first()
            )

            if research is None:
                return jsonify({"error": "Research not found"}), 404

            status = research.status
            progress = research.progress
            completed_at = research.completed_at
            report_path = research.report_path
            metadata = research.research_meta or {}

            # Extract and format error information for better UI display
            error_info = {}
            if metadata and "error" in metadata:
                error_msg = metadata["error"]
                # Match on a lowercase string copy so a non-string stored
                # error (e.g. None or a dict) cannot crash the status poll;
                # the original value is still what gets reported back.
                error_text = (
                    error_msg if isinstance(error_msg, str) else str(error_msg)
                ).lower()

                # Ordered (type, keywords, message, suggestion) rules; the
                # first rule with a matching keyword wins.
                known_errors = (
                    (
                        "timeout",
                        ("timeout",),
                        "LLM service timed out during synthesis. This may be due to high server load or connectivity issues.",
                        "Try again later or use a smaller query scope.",
                    ),
                    (
                        "token_limit",
                        ("token limit", "context length"),
                        "The research query exceeded the AI model's token limit during synthesis.",
                        "Try using a more specific query or reduce the research scope.",
                    ),
                    (
                        "llm_error",
                        ("final answer synthesis fail", "llm error"),
                        "The AI model encountered an error during final answer synthesis.",
                        "Check that your LLM service is running correctly or try a different model.",
                    ),
                    (
                        "ollama_error",
                        ("ollama",),
                        "The Ollama service is not responding properly.",
                        "Make sure Ollama is running with 'ollama serve' and the model is downloaded.",
                    ),
                    (
                        "connection",
                        ("connection",),
                        "Connection error with the AI service.",
                        "Check your internet connection and AI service status.",
                    ),
                )

                for error_type, keywords, message, suggestion in known_errors:
                    if any(keyword in error_text for keyword in keywords):
                        error_info = {
                            "type": error_type,
                            "message": message,
                            "suggestion": suggestion,
                        }
                        break
                else:
                    # Unclassified error: report the original message and
                    # prefer a solution stored in metadata, if there is one.
                    error_info = {
                        "type": "unknown",
                        "message": error_msg,
                        "suggestion": metadata.get("solution")
                        or "Try again with a different query or check the application logs.",
                    }

            # Get the latest milestone log for this research
            latest_milestone = None
            try:
                milestone_log = (
                    db_session.query(ResearchLog)
                    .filter_by(research_id=research_id, level="MILESTONE")
                    .order_by(ResearchLog.timestamp.desc())
                    .first()
                )
                if milestone_log:
                    latest_milestone = {
                        "message": milestone_log.message,
                        "time": milestone_log.timestamp.isoformat()
                        if milestone_log.timestamp
                        else None,
                        "type": "MILESTONE",
                    }
                    logger.debug(
                        f"Found latest milestone for research {research_id}: {milestone_log.message}"
                    )
                else:
                    logger.debug(
                        f"No milestone logs found for research {research_id}"
                    )
            except Exception as e:
                # A missing milestone must not fail the whole status call.
                logger.warning(f"Error fetching latest milestone: {e!s}")

            filtered_metadata = strip_settings_snapshot(metadata)
            if error_info:
                filtered_metadata["error_info"] = error_info

            response_data = {
                "status": status,
                "progress": progress,
                "completed_at": completed_at,
                "report_path": report_path,
                "metadata": filtered_metadata,
            }

            # Include latest milestone as a log_entry for frontend compatibility
            if latest_milestone:
                response_data["log_entry"] = latest_milestone

            return jsonify(response_data)
    except Exception:
        logger.exception("Error getting research status")
        return jsonify({"error": "Error checking research status"}), 500

1497

1498

@research_bp.route("/api/queue/status", methods=["GET"])
@login_required
def get_queue_status():
    """Return the current user's research queue.

    Returns:
        ``{"status": "success", "queue": [...], "total": n}``; 401 when no
        username is in the session; 500 with a generic error payload when
        the queue lookup fails.
    """
    username = session.get("username")
    # Same guard as the other endpoints in this module: never hand a missing
    # username to the queue backend.
    if not username:
        return jsonify({"error": "Not authenticated"}), 401

    from ..queue import QueueManager

    try:
        queue_items = QueueManager.get_user_queue(username)

        return jsonify(
            {
                "status": "success",
                "queue": queue_items,
                "total": len(queue_items),
            }
        )
    except Exception:
        logger.exception("Error getting queue status")
        return jsonify(
            {"status": "error", "message": "Failed to process request"}
        ), 500

1522

1523

@research_bp.route("/api/queue/<string:research_id>/position", methods=["GET"])
@login_required
def get_queue_position(research_id):
    """Return the queue position of one research for the current user.

    Args:
        research_id: Identifier of the queued research, taken from the URL.

    Returns:
        ``{"status": "success", "position": ...}``; 401 when no username is
        in the session; 404 when the queue manager reports no position;
        500 with a generic error payload when the lookup fails.
    """
    username = session.get("username")
    # Same guard as the other endpoints in this module: never hand a missing
    # username to the queue backend.
    if not username:
        return jsonify({"error": "Not authenticated"}), 401

    from ..queue import QueueManager

    try:
        position = QueueManager.get_queue_position(username, research_id)

        if position is None:
            return jsonify(
                {"status": "error", "message": "Research not found in queue"}
            ), 404

        return jsonify({"status": "success", "position": position})
    except Exception:
        logger.exception("Error getting queue position")
        return jsonify(
            {"status": "error", "message": "Failed to process request"}
        ), 500

1546

1547

@research_bp.route("/api/config/limits", methods=["GET"])
def get_upload_limits():
    """Expose the backend's file upload limits.

    The values are read from ``FileUploadValidator`` so the frontend can
    stay in sync with the backend without hardcoding them.
    """
    limits = {
        "max_file_size": FileUploadValidator.MAX_FILE_SIZE,
        "max_files": FileUploadValidator.MAX_FILES_PER_REQUEST,
        "allowed_mime_types": list(FileUploadValidator.ALLOWED_MIME_TYPES),
    }
    return jsonify(limits)

1563

1564

@research_bp.route("/api/upload/pdf", methods=["POST"])
@login_required
@upload_rate_limit
def upload_pdf():
    """Accept uploaded PDF files and return their extracted text.

    Processing steps:
    - reject requests whose declared size exceeds the combined per-file
      limits of ``FileUploadValidator``
    - require a non-empty ``files`` field and validate the file count
    - validate each file with ``FileUploadValidator.validate_upload``
    - extract text and metadata in a single pass via the PDF extraction
      service

    Per-file failures are collected in ``errors`` rather than aborting the
    whole request.

    Returns:
        A success payload with per-file texts and a combined text; 400 when
        no files were supplied, the count is invalid, or nothing could be
        processed; 401 without a session username; 413 when the request is
        too large; 500 on unexpected failure.
    """
    username = session.get("username")
    if not username:
        return jsonify({"error": "Not authenticated"}), 401

    try:
        # Early request size validation (before reading any files)
        # This prevents memory exhaustion from chunked encoding attacks
        size_cap = (
            FileUploadValidator.MAX_FILES_PER_REQUEST
            * FileUploadValidator.MAX_FILE_SIZE
        )
        if request.content_length and request.content_length > size_cap:
            return jsonify(
                {
                    "error": f"Request too large. Maximum size is {size_cap // (1024 * 1024)}MB"
                }
            ), 413

        # The multipart field must exist and hold at least one named file
        if "files" not in request.files:
            return jsonify({"error": "No files provided"}), 400

        uploads = request.files.getlist("files")
        if not uploads or uploads[0].filename == "":
            return jsonify({"error": "No files selected"}), 400

        # Validate file count
        count_ok, count_error = FileUploadValidator.validate_file_count(
            len(uploads)
        )
        if not count_ok:
            return jsonify({"error": count_error}), 400

        extractor = get_pdf_extraction_service()

        extracted_texts = []
        errors = []

        for upload in uploads:
            if not upload or not upload.filename:
                errors.append("Unnamed file: Skipped")
                continue

            try:
                # Read file content (with disk spooling, large files are read from temp file)
                raw_bytes = upload.read()

                # Comprehensive validation
                upload_ok, upload_error = FileUploadValidator.validate_upload(
                    filename=upload.filename,
                    file_content=raw_bytes,
                    content_length=upload.content_length,
                )
                if not upload_ok:
                    errors.append(f"{upload.filename}: {upload_error}")
                    continue

                # Extract text and metadata in single pass
                outcome = extractor.extract_text_and_metadata(
                    raw_bytes, upload.filename
                )

                if not outcome["success"]:
                    errors.append(f"{upload.filename}: {outcome['error']}")
                    continue

                extracted_texts.append(
                    {
                        "filename": outcome["filename"],
                        "text": outcome["text"],
                        "size": outcome["size"],
                        "pages": outcome["pages"],
                    }
                )

            except Exception:
                logger.exception(f"Error processing {upload.filename}")
                errors.append(f"{upload.filename}: Error processing file")
            finally:
                # Close the file stream to release resources; a failure to
                # close is deliberately ignored.
                try:
                    upload.close()
                except Exception:
                    pass

        # Every successful extraction appends exactly one entry, so the
        # list length is the number of processed files.
        processed_files = len(extracted_texts)

        if processed_files == 0:
            return jsonify(
                {
                    "status": "error",
                    "message": "No files were processed successfully",
                    "errors": errors,
                }
            ), 400

        combined_text = "\n\n".join(
            f"--- From {item['filename']} ---\n{item['text']}"
            for item in extracted_texts
        )

        return jsonify(
            {
                "status": "success",
                "processed_files": processed_files,
                "total_files": len(uploads),
                "extracted_texts": extracted_texts,
                "combined_text": combined_text,
                "errors": errors,
            }
        )

    except Exception:
        logger.exception("Error processing PDF upload")
        return jsonify({"error": "Failed to process PDF files"}), 500

Coverage for src / local_deep_research / web / routes / research_routes.py: 53%

683 statements