Coverage for src / local_deep_research / web / routes / research_routes.py: 53%
683 statements
« prev ^ index » next coverage.py v7.13.4, created at 2026-02-25 01:07 +0000
1import io
2import json
3from datetime import datetime, UTC
4from pathlib import Path
6from flask import (
7 Blueprint,
8 g,
9 jsonify,
10 redirect,
11 request,
12 send_file,
13 session,
14 url_for,
15)
16from loguru import logger
17from ...settings.logger import log_settings
18from sqlalchemy import func
20# Security imports
21from ...constants import ResearchStatus
22from ...security import (
23 FileUploadValidator,
24 filter_research_metadata,
25 strip_settings_snapshot,
26 upload_rate_limit,
27)
28from ...config.paths import get_config_directory
30# Services imports
31from ..services.pdf_extraction_service import get_pdf_extraction_service
33from ...database.models import (
34 QueuedResearch,
35 ResearchHistory,
36 ResearchLog,
37 UserActiveResearch,
38)
39from ...database.models.library import Document as Document
40from ...database.session_context import get_user_db_session
41from ..auth.decorators import login_required
42from ..models.database import calculate_duration
43from ..services.research_service import (
44 export_report_to_memory,
45 run_research_process,
46 start_research_process,
47)
48from ..utils.rate_limiter import limiter
49from ..utils.templates import render_template_with_defaults
50from .globals import active_research, termination_flags
# Create a Blueprint for the research application.
# All routes below (pages, /api/start_research, /api/terminate, ...) hang off
# this blueprint; it is registered by the application factory elsewhere.
research_bp = Blueprint("research", __name__)
56# Add static route at the root level
@research_bp.route("/redirect-static/<path:path>")
def redirect_static(path):
    """Redirect old static URLs to new static URLs"""
    target = url_for("static", filename=path)
    return redirect(target)
@research_bp.route("/progress/<string:research_id>")
@login_required
def progress_page(research_id):
    """Render the research progress page (research_id is unused server-side)."""
    template_name = "pages/progress.html"
    return render_template_with_defaults(template_name)
@research_bp.route("/details/<string:research_id>")
@login_required
def research_details_page(research_id):
    """Render the research details page (research_id is unused server-side)."""
    template_name = "pages/details.html"
    return render_template_with_defaults(template_name)
@research_bp.route("/results/<string:research_id>")
@login_required
def results_page(research_id):
    """Render the research results page (research_id is unused server-side)."""
    template_name = "pages/results.html"
    return render_template_with_defaults(template_name)
@research_bp.route("/history")
@login_required
def history_page():
    """Render the research history page."""
    template_name = "pages/history.html"
    return render_template_with_defaults(template_name)
91# Add missing settings routes
@research_bp.route("/settings", methods=["GET"])
@login_required
def settings_page():
    """Render the settings dashboard page."""
    template_name = "settings_dashboard.html"
    return render_template_with_defaults(template_name)
@research_bp.route("/settings/main", methods=["GET"])
@login_required
def main_config_page():
    """Render the main settings config page."""
    template_name = "main_config.html"
    return render_template_with_defaults(template_name)
@research_bp.route("/settings/collections", methods=["GET"])
@login_required
def collections_config_page():
    """Render the collections config page."""
    template_name = "collections_config.html"
    return render_template_with_defaults(template_name)
@research_bp.route("/settings/api_keys", methods=["GET"])
@login_required
def api_keys_config_page():
    """Render the API keys config page."""
    template_name = "api_keys_config.html"
    return render_template_with_defaults(template_name)
@research_bp.route("/settings/search_engines", methods=["GET"])
@login_required
def search_engines_config_page():
    """Render the search engines config page."""
    template_name = "search_engines_config.html"
    return render_template_with_defaults(template_name)
@research_bp.route("/settings/llm", methods=["GET"])
@login_required
def llm_config_page():
    """Render the LLM config page."""
    template_name = "llm_config.html"
    return render_template_with_defaults(template_name)
@research_bp.route("/api/start_research", methods=["POST"])
@login_required
def start_research():
    """Create a new research run from the posted JSON payload.

    Any parameter missing from the request body (model_provider, model,
    search_engine, iterations, questions_per_iteration, strategy, ...) falls
    back to the user's stored settings.  A snapshot of all settings is
    captured and stored with the research so the background thread can run
    without a request context.  If the user already has the maximum number of
    concurrent researches, the request is queued instead of started.

    Returns:
        JSON with ``research_id`` and a success/queued status, or an error
        payload with HTTP 400 (validation), 401 (not authenticated) or
        500 (persistence / snapshot failure).
    """
    data = request.json
    # Debug logging to trace model parameter
    logger.debug(f"Request data keys: {list(data.keys()) if data else 'None'}")

    # Check if this is a news search (flagged by the caller in metadata)
    metadata = data.get("metadata", {})
    if metadata.get("is_news_search"):
        logger.info(
            f"News search request received: triggered_by={metadata.get('triggered_by', 'unknown')}"
        )

    query = data.get("query")
    mode = data.get("mode", "quick")

    # Replace date placeholders if they exist
    if query and "YYYY-MM-DD" in query:
        # Use local system time
        current_date = datetime.now(UTC).strftime("%Y-%m-%d")

        original_query = query
        query = query.replace("YYYY-MM-DD", current_date)
        logger.info(
            f"Replaced date placeholder in query: {original_query[:100]}... -> {query[:100]}..."
        )
        logger.info(f"Using date: {current_date}")

        # Update metadata to track the replacement
        if not metadata:
            metadata = {}
        metadata["original_query"] = original_query
        metadata["processed_query"] = query
        metadata["date_replaced"] = current_date
        data["metadata"] = metadata

    # Get parameters from request or use database settings
    from ...settings.manager import SettingsManager

    username = session.get("username")
    if not username:
        return jsonify({"error": "Not authenticated"}), 401

    # Open a short-lived per-user session just to resolve defaults.
    with get_user_db_session(username) as db_session:
        settings_manager = SettingsManager(db_session=db_session)

        # Get model provider and model selections - use database settings if not provided
        model_provider = data.get("model_provider")
        if not model_provider:
            model_provider = settings_manager.get_setting(
                "llm.provider", "OLLAMA"
            )
            logger.debug(
                f"No model_provider in request, using database setting: {model_provider}"
            )
        else:
            logger.debug(f"Using model_provider from request: {model_provider}")

        model = data.get("model")
        if not model:
            model = settings_manager.get_setting("llm.model", None)
            logger.debug(
                f"No model in request, using database setting: {model}"
            )
        else:
            logger.debug(f"Using model from request: {model}")

        custom_endpoint = data.get("custom_endpoint")
        if not custom_endpoint and model_provider == "OPENAI_ENDPOINT":
            custom_endpoint = settings_manager.get_setting(
                "llm.openai_endpoint.url", None
            )
            logger.debug(
                f"No custom_endpoint in request, using database setting: {custom_endpoint}"
            )

        # Get Ollama URL from request or settings
        ollama_url = data.get("ollama_url")
        if not ollama_url and model_provider == "OLLAMA":
            ollama_url = settings_manager.get_setting(
                "llm.ollama.url", "http://localhost:11434"
            )
            logger.debug(
                f"No ollama_url in request, using database setting: {ollama_url}"
            )

        # "search_tool" is accepted as a legacy alias for "search_engine".
        search_engine = data.get("search_engine") or data.get("search_tool")
        if not search_engine:
            search_engine = settings_manager.get_setting(
                "search.tool", "searxng"
            )

        max_results = data.get("max_results")
        time_period = data.get("time_period")

        iterations = data.get("iterations")
        if iterations is None:
            iterations = settings_manager.get_setting("search.iterations", 5)

        questions_per_iteration = data.get("questions_per_iteration")
        if questions_per_iteration is None:
            questions_per_iteration = settings_manager.get_setting(
                "search.questions_per_iteration", 5
            )

        # Get strategy from request or database
        strategy = data.get("strategy")
        if not strategy:
            strategy = settings_manager.get_setting(
                "search.search_strategy", "source-based"
            )

        # Debug logging for model parameter specifically
        logger.debug(
            f"Extracted model value: '{model}' (type: {type(model).__name__})"
        )

        # Log the selections for troubleshooting
        logger.info(
            f"Starting research with provider: {model_provider}, model: {model}, search engine: {search_engine}"
        )
        logger.info(
            f"Additional parameters: max_results={max_results}, time_period={time_period}, iterations={iterations}, questions={questions_per_iteration}, strategy={strategy}"
        )

    if not query:
        return jsonify({"status": "error", "message": "Query is required"}), 400

    # Validate required parameters based on provider
    if model_provider == "OPENAI_ENDPOINT" and not custom_endpoint:
        return (
            jsonify(
                {
                    "status": "error",
                    "message": "Custom endpoint URL is required for OpenAI endpoint provider",
                }
            ),
            400,
        )

    if not model:
        logger.error(
            f"No model specified or configured. Provider: {model_provider}"
        )
        return jsonify(
            {
                "status": "error",
                "message": "Model is required. Please configure a model in the settings.",
            }
        ), 400

    # Check if the user has too many active researches
    username = session.get("username")

    # Get max concurrent researches from settings
    from ...settings import SettingsManager

    # NOTE(review): this second session is opened without an explicit username
    # argument, unlike the one above — confirm get_user_db_session() resolves
    # the current user from the request context.
    with get_user_db_session() as db_session:
        settings_manager = SettingsManager(db_session)
        max_concurrent_researches = settings_manager.get_setting(
            "app.max_concurrent_researches", 3
        )

    # Use existing session from g to check active researches
    try:
        if hasattr(g, "db_session") and g.db_session:
            # Count active researches for this user
            active_count = (
                g.db_session.query(UserActiveResearch)
                .filter_by(username=username, status=ResearchStatus.IN_PROGRESS)
                .count()
            )

            # Debug logging
            logger.info(
                f"Active research count for {username}: {active_count}/{max_concurrent_researches}"
            )

            should_queue = active_count >= max_concurrent_researches
            logger.info(f"Should queue new research: {should_queue}")
        else:
            logger.warning(
                "No database session available to check active researches"
            )
            should_queue = False
    except Exception:
        logger.exception("Failed to check active researches")
        # Default to not queueing if we can't check
        should_queue = False

    # Create a record in the database with explicit UTC timestamp
    import uuid
    import threading

    created_at = datetime.now(UTC).isoformat()
    research_id = str(uuid.uuid4())

    # Create organized research metadata with settings snapshot
    research_settings = {
        # Direct submission parameters
        "submission": {
            "model_provider": model_provider,
            "model": model,
            "custom_endpoint": custom_endpoint,
            "search_engine": search_engine,
            "max_results": max_results,
            "time_period": time_period,
            "iterations": iterations,
            "questions_per_iteration": questions_per_iteration,
            "strategy": strategy,
        },
        # System information
        "system": {
            "timestamp": created_at,
            "user": username,
            "version": "1.0",  # Track metadata version for future migrations
            "server_url": request.host_url,  # Add server URL for link generation
        },
    }

    # Add any additional metadata from request.
    # NOTE(review): this is a top-level dict update, so request metadata keys
    # could shadow "submission"/"system" — confirm that is intended.
    additional_metadata = data.get("metadata", {})
    if additional_metadata:
        research_settings.update(additional_metadata)

    # Get complete settings snapshot for this research
    try:
        from local_deep_research.settings import SettingsManager

        # Use the existing session from g (set by middleware)
        if hasattr(g, "db_session") and g.db_session:
            # Create SettingsManager with the existing session
            username = session.get("username")
            # Ensure any pending changes are committed
            try:
                g.db_session.commit()
            except Exception:
                g.db_session.rollback()
            settings_manager = SettingsManager(g.db_session)
            # Get all current settings as a snapshot (bypass cache to ensure fresh data)
            all_settings = settings_manager.get_all_settings(bypass_cache=True)

            # Add settings snapshot to metadata
            research_settings["settings_snapshot"] = all_settings
            logger.info(
                f"Captured {len(all_settings)} settings for research {research_id}"
            )
        else:
            # If no session in g, create a new one temporarily to get settings
            logger.warning(
                "No database session in g, creating temporary session for settings snapshot"
            )
            from ...database.thread_local_session import get_metrics_session

            # Get password from session or g
            password = getattr(g, "user_password", None)
            if not password:
                # Try to get from session password store
                from ...database.session_passwords import session_password_store

                session_id = session.get("session_id")
                if session_id:
                    password = session_password_store.get_session_password(
                        username, session_id
                    )

            if password:
                temp_session = get_metrics_session(username, password)
                if temp_session:
                    username = session.get("username")
                    settings_manager = SettingsManager(temp_session)
                    all_settings = settings_manager.get_all_settings(
                        bypass_cache=True
                    )
                    research_settings["settings_snapshot"] = all_settings
                    logger.info(
                        f"Captured {len(all_settings)} settings using temporary session for research {research_id}"
                    )
                else:
                    logger.error(
                        "Failed to create temporary session for settings snapshot"
                    )
                    raise Exception(
                        "Cannot create research without settings snapshot"
                    )
            else:
                logger.error(
                    "No password available to create session for settings snapshot"
                )
                raise Exception(
                    "Cannot create research without settings snapshot"
                )
    except Exception:
        logger.exception("Failed to capture settings snapshot")
        # Cannot continue without settings snapshot for thread-based research
        return jsonify(
            {
                "status": "error",
                "message": "Failed to capture settings for research. Please try again.",
            }
        ), 500

    # Use existing session from g
    username = session.get("username")
    if not username:
        return jsonify({"status": "error", "message": "Not authenticated"}), 401

    try:
        # Use existing session from g
        if hasattr(g, "db_session") and g.db_session:
            db_session = g.db_session
            # Determine initial status based on whether we need to queue
            initial_status = (
                ResearchStatus.QUEUED
                if should_queue
                else ResearchStatus.IN_PROGRESS
            )

            research = ResearchHistory(
                id=research_id,  # Set UUID as primary key
                query=query,
                mode=mode,
                status=initial_status,
                created_at=created_at,
                progress_log=[{"time": created_at, "progress": 0}],
                research_meta=research_settings,
            )
            db_session.add(research)
            db_session.commit()
            logger.info(
                f"Created research entry with UUID: {research_id}, status: {initial_status}"
            )

            if should_queue:
                # Add to queue instead of starting immediately
                # Get the next position in queue for this user
                max_position = (
                    db_session.query(func.max(QueuedResearch.position))
                    .filter_by(username=username)
                    .scalar()
                    or 0
                )

                queued_record = QueuedResearch(
                    username=username,
                    research_id=research_id,
                    query=query,
                    mode=mode,
                    settings_snapshot=research_settings,
                    position=max_position + 1,
                )
                db_session.add(queued_record)
                db_session.commit()
                logger.info(
                    f"Queued research {research_id} at position {max_position + 1} for user {username}"
                )

                # Notify queue processor with all parameters for potential direct execution
                from ..queue.processor_v2 import queue_processor

                # Get session ID for password access
                session_id = session.get("session_id")

                # Pass all parameters needed for direct execution
                queue_processor.notify_research_queued(
                    username,
                    research_id,
                    session_id=session_id,
                    query=query,
                    mode=mode,
                    settings_snapshot=research_settings,
                    model_provider=model_provider,
                    model=model,
                    custom_endpoint=custom_endpoint,
                    search_engine=search_engine,
                    max_results=max_results,
                    time_period=time_period,
                    iterations=iterations,
                    questions_per_iteration=questions_per_iteration,
                    strategy=strategy,
                )

                # Return queued status
                return jsonify(
                    {
                        "status": ResearchStatus.QUEUED,
                        "research_id": research_id,
                        "queue_position": max_position + 1,
                        "message": f"Your research has been queued. Position in queue: {max_position + 1}",
                    }
                )
            else:
                # Start immediately
                # Create active research tracking record
                import threading

                active_record = UserActiveResearch(
                    username=username,
                    research_id=research_id,
                    status=ResearchStatus.IN_PROGRESS,
                    thread_id=str(threading.current_thread().ident),
                    settings_snapshot=research_settings,
                )
                db_session.add(active_record)
                db_session.commit()
                logger.info(
                    f"Created active research record for user {username}"
                )

                # Double-check the count after committing to handle race conditions
                # Use the existing session for the recheck
                try:
                    # Use the same session we already have
                    recheck_session = db_session
                    final_count = (
                        recheck_session.query(UserActiveResearch)
                        .filter_by(
                            username=username, status=ResearchStatus.IN_PROGRESS
                        )
                        .count()
                    )
                    logger.info(
                        f"Final active count after commit: {final_count}/{max_concurrent_researches}"
                    )

                    if final_count > max_concurrent_researches:
                        # We exceeded the limit due to a race condition
                        # Remove this record and queue instead
                        logger.warning(
                            f"Race condition detected: {final_count} > {max_concurrent_researches}, moving to queue"
                        )
                        db_session.delete(active_record)
                        db_session.commit()

                        # Add to queue
                        max_position = (
                            db_session.query(func.max(QueuedResearch.position))
                            .filter_by(username=username)
                            .scalar()
                            or 0
                        )

                        queued_record = QueuedResearch(
                            username=username,
                            research_id=research_id,
                            query=query,
                            mode=mode,
                            settings_snapshot=research_settings,
                            position=max_position + 1,
                        )
                        db_session.add(queued_record)

                        # Update research status to queued
                        research.status = ResearchStatus.QUEUED
                        db_session.commit()

                        # Notify queue processor for potential direct execution
                        from ..queue.processor_v2 import queue_processor

                        # Get session ID for password access
                        session_id = session.get("session_id")

                        # Pass all parameters needed for direct execution
                        queue_processor.notify_research_queued(
                            username,
                            research_id,
                            session_id=session_id,
                            query=query,
                            mode=mode,
                            settings_snapshot=research_settings,
                            model_provider=model_provider,
                            model=model,
                            custom_endpoint=custom_endpoint,
                            search_engine=search_engine,
                            max_results=max_results,
                            time_period=time_period,
                            iterations=iterations,
                            questions_per_iteration=questions_per_iteration,
                            strategy=strategy,
                        )

                        return jsonify(
                            {
                                "status": ResearchStatus.QUEUED,
                                "research_id": research_id,
                                "queue_position": max_position + 1,
                                "message": f"Your research has been queued due to concurrent limit. Position in queue: {max_position + 1}",
                            }
                        )
                except Exception as e:
                    logger.warning(f"Could not recheck active count: {e}")

    except Exception:
        logger.exception("Failed to create research entry")
        return jsonify(
            {"status": "error", "message": "Failed to create research entry"}
        ), 500

    # Only start the research if not queued
    if not should_queue:
        # Save the research strategy to the database before starting the thread
        try:
            from ..services.research_service import save_research_strategy

            save_research_strategy(research_id, strategy, username=username)
        except Exception as e:
            logger.warning(f"Could not save research strategy: {e}")

        # Debug logging for settings snapshot
        snapshot_data = research_settings.get("settings_snapshot", {})
        log_settings(snapshot_data, "Settings snapshot being passed to thread")
        if "search.tool" in snapshot_data:
            logger.debug(
                f"search.tool in snapshot: {snapshot_data['search.tool']}"
            )
        else:
            logger.debug("search.tool NOT in snapshot")

        # Get the user's password for metrics access in background thread
        # Try session password store first
        from ...database.session_passwords import session_password_store

        session_id = session.get("session_id")
        user_password = None

        if session_id:
            user_password = session_password_store.get_session_password(
                username, session_id
            )

        # Fallback to g.user_password (set by middleware if temp_auth was used)
        if not user_password:
            user_password = getattr(g, "user_password", None)

        # Last resort: try temp_auth_store
        if not user_password:
            from ...database.temp_auth import temp_auth_store

            auth_token = session.get("temp_auth_token")
            if auth_token:
                # Use peek_auth to avoid consuming the token
                auth_data = temp_auth_store.peek_auth(auth_token)
                if auth_data and auth_data[0] == username:
                    user_password = auth_data[1]

        if not user_password:
            logger.warning(
                f"No password available for metrics access for user {username}"
            )

        # Start the research process with the selected parameters
        research_thread = start_research_process(
            research_id,
            query,
            mode,
            active_research,
            termination_flags,
            run_research_process,
            username=username,  # Pass username to the thread
            user_password=user_password,  # Pass password for database access
            model_provider=model_provider,
            model=model,
            custom_endpoint=custom_endpoint,
            search_engine=search_engine,
            max_results=max_results,
            time_period=time_period,
            iterations=iterations,
            questions_per_iteration=questions_per_iteration,
            strategy=strategy,
            settings_snapshot=snapshot_data,  # Pass complete settings
        )

        # Update the active research record with the actual thread ID
        try:
            with get_user_db_session(username) as thread_session:
                active_record = (
                    thread_session.query(UserActiveResearch)
                    .filter_by(username=username, research_id=research_id)
                    .first()
                )
                if active_record:
                    active_record.thread_id = str(research_thread.ident)
                    thread_session.commit()
        except Exception as e:
            logger.warning(f"Could not update thread ID: {e}")

    return jsonify({"status": "success", "research_id": research_id})
@research_bp.route("/api/terminate/<string:research_id>", methods=["POST"])
@login_required
def terminate_research(research_id):
    """Terminate an in-progress research process.

    Sets the shared termination flag for the research's worker thread,
    appends a termination entry to both the in-memory and persisted progress
    logs, marks the record SUSPENDED, and notifies listeners over the socket
    channel.  Already-finished researches return success without changes.
    """
    username = session.get("username")
    if not username:
        return jsonify({"error": "Not authenticated"}), 401

    # Check if the research exists and is in progress
    try:
        with get_user_db_session(username) as db_session:
            research = (
                db_session.query(ResearchHistory)
                .filter_by(id=research_id)
                .first()
            )

            if not research:
                return jsonify(
                    {"status": "error", "message": "Research not found"}
                ), 404

            status = research.status

            # If it's already in a terminal state, return success
            if status in (
                ResearchStatus.COMPLETED,
                ResearchStatus.SUSPENDED,
                ResearchStatus.FAILED,
                ResearchStatus.ERROR,
            ):
                return jsonify(
                    {
                        "status": "success",
                        "message": f"Research already {status}",
                    }
                )

            # Check if it's in the active_research dict; if not, there is no
            # worker thread to signal — just mark the row suspended.
            if research_id not in active_research:
                # Update the status in the database
                research.status = ResearchStatus.SUSPENDED
                db_session.commit()
                return jsonify(
                    {"status": "success", "message": "Research terminated"}
                )

            # Set the termination flag (polled by the worker thread)
            termination_flags[research_id] = True

            # Log the termination request - using UTC timestamp
            timestamp = datetime.now(UTC).isoformat()
            termination_message = "Research termination requested by user"
            current_progress = active_research[research_id]["progress"]

            # Create log entry
            log_entry = {
                "time": timestamp,
                "message": termination_message,
                "progress": current_progress,
                "metadata": {"phase": "termination"},
            }

            # Add to in-memory log
            active_research[research_id]["log"].append(log_entry)

            # Add to database log
            logger.log("MILESTONE", f"Research ended: {termination_message}")

            # Update the log in the database.  progress_log may be stored as a
            # JSON string or an already-deserialized list; tolerate both.
            if research.progress_log:
                try:
                    if isinstance(research.progress_log, str):
                        current_log = json.loads(research.progress_log)
                    else:
                        current_log = research.progress_log
                except Exception:
                    current_log = []
            else:
                current_log = []

            current_log.append(log_entry)
            research.progress_log = current_log
            research.status = ResearchStatus.SUSPENDED
            db_session.commit()

            # Emit a socket event for the termination request
            try:
                event_data = {
                    "status": ResearchStatus.SUSPENDED,
                    "message": "Research was suspended by user request",
                }

                from ..services.socket_service import SocketIOService

                SocketIOService().emit_socket_event(
                    f"research_progress_{research_id}", event_data
                )
            except Exception:
                logger.exception("Socket emit error (non-critical)")

            return jsonify(
                {
                    "status": "success",
                    "message": "Research termination requested",
                }
            )
    except Exception:
        logger.exception("Error terminating research")
        return jsonify(
            {"status": "error", "message": "Failed to terminate research"}
        ), 500
@research_bp.route("/api/delete/<string:research_id>", methods=["DELETE"])
@login_required
def delete_research(research_id):
    """Delete a research record and its on-disk report, if any."""
    username = session.get("username")
    if not username:
        return jsonify({"error": "Not authenticated"}), 401

    try:
        with get_user_db_session(username) as db:
            record = (
                db.query(ResearchHistory).filter_by(id=research_id).first()
            )
            if record is None:
                return jsonify(
                    {"status": "error", "message": "Research not found"}
                ), 404

            # Refuse to delete anything that is still actively running.
            still_running = (
                record.status == ResearchStatus.IN_PROGRESS
                and research_id in active_research
            )
            if still_running:
                return (
                    jsonify(
                        {
                            "status": "error",
                            "message": "Cannot delete research that is in progress",
                        }
                    ),
                    400,
                )

            # Best-effort removal of the generated report file.
            report_file = record.report_path
            if report_file and Path(report_file).exists():
                try:
                    Path(report_file).unlink()
                except Exception:
                    logger.exception("Error removing report file")

            # Delete the database record
            db.delete(record)
            db.commit()

            return jsonify({"status": "success"})
    except Exception:
        logger.exception("Error deleting research")
        return jsonify(
            {"status": "error", "message": "Failed to delete research"}
        ), 500
@research_bp.route("/api/clear_history", methods=["POST"])
@login_required
def clear_history():
    """Delete all non-active research history records and their report files."""
    username = session.get("username")
    if not username:
        return jsonify({"error": "Not authenticated"}), 401

    try:
        with get_user_db_session(username) as db:
            # First pass: remove report files from disk, skipping anything
            # that is currently running.
            for record in db.query(ResearchHistory).all():
                if record.id in active_research:
                    continue
                report_file = record.report_path
                if report_file and Path(report_file).exists():
                    try:
                        Path(report_file).unlink()
                    except Exception:
                        logger.exception("Error removing report file")

            # Second pass: bulk-delete rows, excluding active researches.
            rows = db.query(ResearchHistory)
            if active_research:
                rows = rows.filter(
                    ~ResearchHistory.id.in_(list(active_research.keys()))
                )
            rows.delete(synchronize_session=False)

            db.commit()

            return jsonify({"status": "success"})
    except Exception:
        logger.exception("Error clearing history")
        return jsonify(
            {"status": "error", "message": "Failed to process request"}
        ), 500
@research_bp.route("/open_file_location", methods=["POST"])
@login_required
def open_file_location():
    """Open a file location in the system file explorer.

    Security: This endpoint is disabled for server deployments.
    It only makes sense for desktop usage where the server and client are on the same machine.
    """
    payload = {
        "status": "error",
        "message": "This feature is disabled. It is only available in desktop mode.",
    }
    return jsonify(payload), 403
@research_bp.route("/api/save_raw_config", methods=["POST"])
@login_required
def save_raw_config():
    """Validate and persist a raw TOML configuration file.

    Security measures:
    - The TOML is fully parsed before anything is written, so malformed
      input is rejected with a clean JSON 400 (no internal details, CWE-209).
    - Keys that could enable dynamic code loading (module/class paths)
      are rejected with a 403.
    - The final write goes through ``write_file_verified``, which enforces
      the ``system.allow_config_write`` setting.

    Returns:
        JSON ``{"success": True}`` on success; otherwise an error payload
        with status 400 (bad input), 403 (blocked keys) or 500 (write error).
    """
    # Robustness: tolerate missing or invalid JSON bodies instead of letting
    # the framework raise — every failure mode returns this endpoint's own
    # JSON error contract.
    data = request.get_json(silent=True) or {}
    raw_config = data.get("raw_config")

    if not raw_config:
        return (
            jsonify(
                {"success": False, "error": "Raw configuration is required"}
            ),
            400,
        )

    # tomllib is stdlib from Python 3.11; fall back to the tomli backport.
    try:
        import tomllib
    except ImportError:
        import tomli as tomllib

    try:
        parsed_config = tomllib.loads(raw_config)
    except Exception as e:
        logger.warning(f"Invalid TOML configuration: {e}")
        # Don't expose internal exception details to users (CWE-209)
        return jsonify(
            {
                "success": False,
                "error": "Invalid TOML syntax. Please check your configuration format.",
            }
        ), 400

    # Security: Check for dangerous keys that could enable code execution.
    # These patterns match keys used for dynamic module imports.
    BLOCKED_KEY_PATTERNS = ["module_path", "class_name", "module", "class"]

    def find_blocked_keys(obj, path=""):
        """Recursively collect dotted paths of any blocked keys in the config."""
        blocked = []
        if isinstance(obj, dict):
            for key, value in obj.items():
                current_path = f"{path}.{key}" if path else key
                key_lower = key.lower()
                if any(p in key_lower for p in BLOCKED_KEY_PATTERNS):
                    blocked.append(current_path)
                # Recurse into nested values regardless of a match so every
                # offending path is reported in one response.
                blocked.extend(find_blocked_keys(value, current_path))
        elif isinstance(obj, list):
            for i, item in enumerate(obj):
                blocked.extend(find_blocked_keys(item, f"{path}[{i}]"))
        return blocked

    blocked_keys = find_blocked_keys(parsed_config)
    if blocked_keys:
        logger.warning(
            f"Security: Blocked attempt to write config with dangerous keys: {blocked_keys}"
        )
        return jsonify(
            {
                "success": False,
                "error": "Configuration contains protected keys that cannot be modified",
                "blocked_keys": blocked_keys,
            }
        ), 403

    try:
        from ...security.file_write_verifier import write_file_verified

        # Centralized path config; respects LDR_DATA_DIR.
        config_dir = get_config_directory()
        config_path = config_dir / "config.toml"

        # The verifier checks "system.allow_config_write" before any bytes
        # hit disk.
        write_file_verified(
            config_path,
            raw_config,
            "system.allow_config_write",
            context="system configuration file",
        )

        return jsonify({"success": True})
    except Exception:
        logger.exception("Error saving configuration file")
        return jsonify(
            {"success": False, "error": "Failed to process request"}
        ), 500
@research_bp.route("/api/history", methods=["GET"])
@login_required
def get_history():
    """Return the user's research history, newest first, as JSON."""
    username = session.get("username")
    if not username:
        return jsonify({"error": "Not authenticated"}), 401

    try:
        with get_user_db_session(username) as db_session:
            records = (
                db_session.query(ResearchHistory)
                .order_by(ResearchHistory.created_at.desc())
                .all()
            )

            # Materialize everything while the session is still open so we
            # never touch detached ORM instances (DetachedInstanceError).
            items = []
            for record in records:
                # Duration is only meaningful once the run has completed.
                duration_seconds = None
                if record.completed_at and record.created_at:
                    try:
                        duration_seconds = calculate_duration(
                            record.created_at, record.completed_at
                        )
                    except Exception:
                        logger.exception("Error calculating duration")

                # Number of library documents produced by this research.
                doc_count = (
                    db_session.query(Document)
                    .filter_by(research_id=record.id)
                    .count()
                )

                entry = {
                    "id": record.id,
                    "query": record.query,
                    "mode": record.mode,
                    "status": record.status,
                    "created_at": record.created_at,
                    "completed_at": record.completed_at,
                    "duration_seconds": duration_seconds,
                    "metadata": filter_research_metadata(
                        record.research_meta
                    ),
                    "document_count": doc_count,
                }

                # Older rows may predate the title column.
                if hasattr(record, "title") and record.title is not None:
                    entry["title"] = record.title

                items.append(entry)

            return jsonify({"status": "success", "items": items})
    except Exception:
        logger.exception("Error getting history")
        return jsonify(
            {"status": "error", "message": "Failed to process request"}
        ), 500
@research_bp.route("/api/research/<string:research_id>")
@login_required
def get_research_details(research_id):
    """Return the full details of one research record as JSON."""
    username = session.get("username")
    if not username:
        return jsonify({"error": "Not authenticated"}), 401

    try:
        with get_user_db_session(username) as db_session:
            record = (
                db_session.query(ResearchHistory)
                .filter(ResearchHistory.id == research_id)
                .first()
            )

            if not record:
                return jsonify({"error": "Research not found"}), 404

            # Settings snapshot is stripped so internal config never leaks.
            details = {
                "id": record.id,
                "query": record.query,
                "status": record.status,
                "progress": record.progress,
                "progress_percentage": record.progress or 0,
                "mode": record.mode,
                "created_at": record.created_at,
                "completed_at": record.completed_at,
                "report_path": record.report_path,
                "metadata": strip_settings_snapshot(record.research_meta),
            }
            return jsonify(details)
    except Exception:
        logger.exception("Error getting research details")
        return jsonify({"error": "An internal error has occurred"}), 500
@research_bp.route("/api/research/<string:research_id>/logs")
@login_required
def get_research_logs(research_id):
    """Return all log entries for a research, ordered by timestamp."""
    username = session.get("username")
    if not username:
        return jsonify({"error": "Not authenticated"}), 401

    try:
        with get_user_db_session(username) as db_session:
            # 404 early if the research itself does not exist.
            research = (
                db_session.query(ResearchHistory)
                .filter_by(id=research_id)
                .first()
            )
            if not research:
                return jsonify({"error": "Research not found"}), 404

            rows = (
                db_session.query(ResearchLog)
                .filter_by(research_id=research_id)
                .order_by(ResearchLog.timestamp)
                .all()
            )

            # Copy plain values out while the session is open to avoid
            # DetachedInstanceError on later ORM attribute access.
            logs = [
                {
                    "id": entry.id,
                    "message": entry.message,
                    "timestamp": entry.timestamp,
                    "log_type": entry.level,
                }
                for entry in rows
            ]

            return jsonify(logs)

    except Exception:
        logger.exception("Error getting research logs")
        return jsonify({"error": "An internal error has occurred"}), 500
@research_bp.route("/api/report/<string:research_id>")
@login_required
def get_research_report(research_id):
    """Return the report content for a research, plus compatibility fields."""
    username = session.get("username")
    if not username:
        return jsonify({"error": "Not authenticated"}), 401

    try:
        with get_user_db_session(username) as db_session:
            record = (
                db_session.query(ResearchHistory)
                .filter_by(id=research_id)
                .first()
            )

            if record is None:
                return jsonify({"error": "Research not found"}), 404

            metadata = record.research_meta

            # Resolve the report through the storage abstraction.
            from ...storage import get_report_storage

            # The settings snapshot lets storage operate outside the
            # original request/thread context.
            settings_snapshot = (
                metadata.get("settings_snapshot") if metadata else None
            )

            storage = get_report_storage(
                session=db_session, settings_snapshot=settings_snapshot
            )
            content = storage.get_report(research_id, username)

            if content is None:
                return jsonify({"error": "Report not found"}), 404

            # Examples still expect 'summary', 'sources' and 'findings' at
            # the top level, so keep those backwards-compatible fields.
            safe_metadata = strip_settings_snapshot(metadata)
            response = {
                "content": content,
                "summary": content,  # the markdown report doubles as summary
                "sources": safe_metadata.get("all_links_of_system", []),
                "findings": safe_metadata.get("findings", []),
                "metadata": {
                    "title": record.title if record.title else None,
                    "query": record.query,
                    "mode": record.mode if record.mode else None,
                    "created_at": record.created_at
                    if record.created_at
                    else None,
                    "completed_at": record.completed_at
                    if record.completed_at
                    else None,
                    "report_path": record.report_path,
                    **safe_metadata,
                },
            }
            return jsonify(response)

    except Exception:
        logger.exception("Error getting research report")
        return jsonify({"error": "An internal error has occurred"}), 500
@research_bp.route(
    "/api/v1/research/<research_id>/export/<format>", methods=["POST"]
)
@login_required
def export_research_report(research_id, format):
    """Export a research report to another format (LaTeX, Quarto, RIS, PDF, ODT, etc.).

    The report is fetched via the storage abstraction and converted entirely
    in memory, then streamed back as a file attachment.

    Args:
        research_id: Identifier of the research whose report to export.
        format: Target format name; validated against the exporter registry.

    Returns:
        A ``send_file`` response on success; otherwise a JSON error with
        400 (bad format), 401 (no session), 404 (missing research/report)
        or 500 (conversion or unexpected failure).
    """
    try:
        # Validate the requested format against the registry before doing
        # any database work.
        from ...exporters import ExporterRegistry

        if not ExporterRegistry.is_format_supported(format):
            available = ExporterRegistry.get_available_formats()
            return jsonify(
                {
                    "error": f"Invalid format. Available formats: {', '.join(available)}"
                }
            ), 400

        # Look up the research record for the authenticated user.
        username = session.get("username")
        if not username:
            return jsonify({"error": "Not authenticated"}), 401

        try:
            with get_user_db_session(username) as db_session:
                research = (
                    db_session.query(ResearchHistory)
                    .filter_by(id=research_id)
                    .first()
                )
                if not research:
                    return jsonify({"error": "Research not found"}), 404

                # Fetch the report through the storage abstraction.
                from ...storage import get_report_storage

                # Settings snapshot (if present) lets storage run outside
                # the original request/thread context.
                metadata = (
                    research.research_meta if research.research_meta else {}
                )
                settings_snapshot = (
                    metadata.get("settings_snapshot") if metadata else None
                )

                storage = get_report_storage(
                    session=db_session, settings_snapshot=settings_snapshot
                )

                # Report content is held in memory; nothing touches disk.
                report_content = storage.get_report(research_id, username)
                if not report_content:
                    return jsonify({"error": "Report content not found"}), 404

                # Convert to the requested format (all in memory).
                try:
                    # Prefer the explicit title; fall back to the query text.
                    pdf_title = research.title or research.query

                    # export_report_to_memory returns the rendered bytes plus
                    # a download filename and MIME type.
                    export_content, filename, mimetype = (
                        export_report_to_memory(
                            report_content, format, title=pdf_title
                        )
                    )

                    # Stream the bytes back without writing a temp file.
                    return send_file(
                        io.BytesIO(export_content),
                        as_attachment=True,
                        download_name=filename,
                        mimetype=mimetype,
                    )
                except Exception:
                    # Conversion failure: keep the format in the message but
                    # hide internal details from the client.
                    logger.exception("Error exporting report")
                    return jsonify(
                        {
                            "error": f"Failed to export to {format}. Please try again later."
                        }
                    ), 500

        except Exception:
            # Database / storage layer failure.
            logger.exception("Error in export endpoint")
            return jsonify({"error": "An internal error has occurred"}), 500

    except Exception:
        # Last-resort guard so the route never leaks a stack trace.
        logger.exception("Unexpected error in export endpoint")
        return jsonify({"error": "An internal error has occurred"}), 500
def _classify_research_error(error_msg, metadata):
    """Map a stored research error message to a structured error-info dict.

    Matches known failure signatures (timeouts, token limits, LLM/Ollama
    failures, connection problems) against the lowercased message and
    returns a payload the UI can display directly.

    Args:
        error_msg: Raw error string from the research metadata.
        metadata: Full research metadata dict (may carry a "solution").

    Returns:
        A dict with "type", "message" and "suggestion" keys.
    """
    lowered = error_msg.lower()
    if "timeout" in lowered:
        return {
            "type": "timeout",
            "message": "LLM service timed out during synthesis. This may be due to high server load or connectivity issues.",
            "suggestion": "Try again later or use a smaller query scope.",
        }
    if "token limit" in lowered or "context length" in lowered:
        return {
            "type": "token_limit",
            "message": "The research query exceeded the AI model's token limit during synthesis.",
            "suggestion": "Try using a more specific query or reduce the research scope.",
        }
    if "final answer synthesis fail" in lowered or "llm error" in lowered:
        return {
            "type": "llm_error",
            "message": "The AI model encountered an error during final answer synthesis.",
            "suggestion": "Check that your LLM service is running correctly or try a different model.",
        }
    if "ollama" in lowered:
        return {
            "type": "ollama_error",
            "message": "The Ollama service is not responding properly.",
            "suggestion": "Make sure Ollama is running with 'ollama serve' and the model is downloaded.",
        }
    if "connection" in lowered:
        return {
            "type": "connection",
            "message": "Connection error with the AI service.",
            "suggestion": "Check your internet connection and AI service status.",
        }
    if metadata.get("solution"):
        # Use the solution provided in metadata if available.
        return {
            "type": "unknown",
            "message": error_msg,
            "suggestion": metadata.get("solution"),
        }
    # Generic fallback carrying the original message.
    return {
        "type": "unknown",
        "message": error_msg,
        "suggestion": "Try again with a different query or check the application logs.",
    }


@research_bp.route("/api/research/<string:research_id>/status")
@limiter.exempt
@login_required
def get_research_status(research_id):
    """Get the status of a research process.

    Returns progress, completion info, filtered metadata (with structured
    error info when the run failed) and, for frontend compatibility, the
    latest MILESTONE log entry if one exists.
    """
    username = session.get("username")
    if not username:
        return jsonify({"error": "Not authenticated"}), 401

    try:
        with get_user_db_session(username) as db_session:
            research = (
                db_session.query(ResearchHistory)
                .filter_by(id=research_id)
                .first()
            )

            if research is None:
                return jsonify({"error": "Research not found"}), 404

            metadata = research.research_meta or {}

            # Structured error info helps the UI show actionable messages.
            error_info = {}
            if metadata and "error" in metadata:
                error_info = _classify_research_error(
                    metadata["error"], metadata
                )

            # Latest MILESTONE log for this research, if any.
            latest_milestone = None
            try:
                milestone_log = (
                    db_session.query(ResearchLog)
                    .filter_by(research_id=research_id, level="MILESTONE")
                    .order_by(ResearchLog.timestamp.desc())
                    .first()
                )
                if milestone_log:
                    latest_milestone = {
                        "message": milestone_log.message,
                        "time": milestone_log.timestamp.isoformat()
                        if milestone_log.timestamp
                        else None,
                        "type": "MILESTONE",
                    }
                    logger.debug(
                        f"Found latest milestone for research {research_id}: {milestone_log.message}"
                    )
                else:
                    logger.debug(
                        f"No milestone logs found for research {research_id}"
                    )
            except Exception as e:
                # Milestones are best-effort; never fail the status call.
                logger.warning(f"Error fetching latest milestone: {e!s}")

            filtered_metadata = strip_settings_snapshot(metadata)
            if error_info:
                filtered_metadata["error_info"] = error_info

            response_data = {
                "status": research.status,
                "progress": research.progress,
                "completed_at": research.completed_at,
                "report_path": research.report_path,
                "metadata": filtered_metadata,
            }

            # Include latest milestone as a log_entry for frontend compatibility
            if latest_milestone:
                response_data["log_entry"] = latest_milestone

            return jsonify(response_data)
    except Exception:
        logger.exception("Error getting research status")
        return jsonify({"error": "Error checking research status"}), 500
@research_bp.route("/api/queue/status", methods=["GET"])
@login_required
def get_queue_status():
    """Return the current user's research queue as JSON."""
    username = session.get("username")

    from ..queue import QueueManager

    try:
        pending = QueueManager.get_user_queue(username)
        payload = {
            "status": "success",
            "queue": pending,
            "total": len(pending),
        }
        return jsonify(payload)
    except Exception:
        logger.exception("Error getting queue status")
        return jsonify(
            {"status": "error", "message": "Failed to process request"}
        ), 500
@research_bp.route("/api/queue/<string:research_id>/position", methods=["GET"])
@login_required
def get_queue_position(research_id):
    """Return the queue position of one research for the current user."""
    username = session.get("username")

    from ..queue import QueueManager

    try:
        position = QueueManager.get_queue_position(username, research_id)

        # None means the research is not queued (already running or unknown).
        if position is None:
            return jsonify(
                {"status": "error", "message": "Research not found in queue"}
            ), 404

        return jsonify({"status": "success", "position": position})
    except Exception:
        logger.exception("Error getting queue position")
        return jsonify(
            {"status": "error", "message": "Failed to process request"}
        ), 500
@research_bp.route("/api/config/limits", methods=["GET"])
def get_upload_limits():
    """Expose the backend's authoritative file-upload limits.

    Serving these values lets the frontend stay in sync without
    hardcoding size/count/MIME limits client-side.
    """
    limits = {
        "max_file_size": FileUploadValidator.MAX_FILE_SIZE,
        "max_files": FileUploadValidator.MAX_FILES_PER_REQUEST,
        "allowed_mime_types": list(FileUploadValidator.ALLOWED_MIME_TYPES),
    }
    return jsonify(limits)
def _extract_one_pdf(file, pdf_service):
    """Validate and extract text from a single uploaded PDF.

    Args:
        file: The werkzeug file object from the multipart request.
        pdf_service: The shared PDF extraction service.

    Returns:
        (entry, error): on success ``entry`` is a dict with filename/text/
        size/pages and ``error`` is None; on failure ``entry`` is None and
        ``error`` is a user-facing message. The file stream is always closed.
    """
    try:
        # Read file content (with disk spooling, large files are read
        # from a temp file).
        pdf_content = file.read()

        # Comprehensive validation (size, MIME, PDF structure).
        is_valid, error_msg = FileUploadValidator.validate_upload(
            filename=file.filename,
            file_content=pdf_content,
            content_length=file.content_length,
        )
        if not is_valid:
            return None, f"{file.filename}: {error_msg}"

        # Extract text and metadata in a single pass (performance fix).
        result = pdf_service.extract_text_and_metadata(
            pdf_content, file.filename
        )
        if result["success"]:
            return {
                "filename": result["filename"],
                "text": result["text"],
                "size": result["size"],
                "pages": result["pages"],
            }, None
        return None, f"{file.filename}: {result['error']}"
    except Exception:
        logger.exception(f"Error processing {file.filename}")
        return None, f"{file.filename}: Error processing file"
    finally:
        # Close the file stream to release resources; best-effort.
        try:
            file.close()
        except Exception:
            pass


@research_bp.route("/api/upload/pdf", methods=["POST"])
@login_required
@upload_rate_limit
def upload_pdf():
    """
    Upload and extract text from PDF files with comprehensive security validation.

    Security features:
    - Rate limiting (10 uploads/min, 100/hour per user)
    - File size validation (50MB max per file)
    - File count validation (100 files max)
    - PDF structure validation
    - MIME type validation

    Performance improvements:
    - Single-pass PDF processing (text + metadata)
    - Optimized extraction service

    Returns:
        JSON with per-file extraction results and a combined text blob;
        400 if no file succeeded, 413 if the request is too large,
        500 on unexpected failure.
    """
    username = session.get("username")
    if not username:
        return jsonify({"error": "Not authenticated"}), 401

    try:
        # Early request size validation (before reading any files).
        # This prevents memory exhaustion from chunked encoding attacks.
        max_request_size = (
            FileUploadValidator.MAX_FILES_PER_REQUEST
            * FileUploadValidator.MAX_FILE_SIZE
        )
        if request.content_length and request.content_length > max_request_size:
            return jsonify(
                {
                    "error": f"Request too large. Maximum size is {max_request_size // (1024 * 1024)}MB"
                }
            ), 413

        # Check if files are present in the request.
        if "files" not in request.files:
            return jsonify({"error": "No files provided"}), 400

        files = request.files.getlist("files")
        if not files or files[0].filename == "":
            return jsonify({"error": "No files selected"}), 400

        # Validate file count.
        is_valid, error_msg = FileUploadValidator.validate_file_count(
            len(files)
        )
        if not is_valid:
            return jsonify({"error": error_msg}), 400

        pdf_service = get_pdf_extraction_service()

        extracted_texts = []
        total_files = len(files)
        processed_files = 0
        errors = []

        for file in files:
            if not file or not file.filename:
                errors.append("Unnamed file: Skipped")
                continue

            entry, error = _extract_one_pdf(file, pdf_service)
            if entry is not None:
                extracted_texts.append(entry)
                processed_files += 1
            else:
                errors.append(error)

        # Prepare response.
        response_data = {
            "status": "success",
            "processed_files": processed_files,
            "total_files": total_files,
            "extracted_texts": extracted_texts,
            "combined_text": "\n\n".join(
                [
                    f"--- From {item['filename']} ---\n{item['text']}"
                    for item in extracted_texts
                ]
            ),
            "errors": errors,
        }

        if processed_files == 0:
            return jsonify(
                {
                    "status": "error",
                    "message": "No files were processed successfully",
                    "errors": errors,
                }
            ), 400

        return jsonify(response_data)

    except Exception:
        logger.exception("Error processing PDF upload")
        return jsonify({"error": "Failed to process PDF files"}), 500