Coverage for src/local_deep_research/constants.py: 100%
53 statements
« prev ^ index » next coverage.py v7.14.1, created at 2026-06-03 23:15 +0000
« prev ^ index » next coverage.py v7.14.1, created at 2026-06-03 23:15 +0000
1"""Project-wide constants for Local Deep Research."""
3from enum import StrEnum
4from typing import Dict, List
6from .__version__ import __version__
# Truthful User-Agent that names this project and its version. Intended for
# APIs that prefer or require callers to identify themselves (for example
# academic APIs such as arXiv, PubMed, and OpenAlex).
USER_AGENT = " ".join(
    (
        f"Local-Deep-Research/{__version__}",
        "(Academic Research Tool; https://github.com/LearningCircuit/local-deep-research)",
    )
)
# User-Agent that imitates a desktop browser, for sites that may block
# requests identifying themselves as bots. Reach for it sparingly, and only
# when it is actually needed.
BROWSER_USER_AGENT = " ".join(
    (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
        "AppleWebKit/537.36 (KHTML, like Gecko)",
        "Chrome/120.0.0.0 Safari/537.36",
    )
)
24# --- Research status values ---
25# Frontend helpers: src/local_deep_research/web/static/js/config/constants.js
26# Injected via: src/local_deep_research/web/app_factory.py (inject_frontend_constants)
27# Template: src/local_deep_research/web/templates/base.html
28# If you add/remove/rename a status here, the frontend picks it up automatically.
29class ResearchStatus(StrEnum):
30 """Status values for research records.
32 Uses StrEnum so values compare equal to plain strings,
33 e.g. ``ResearchStatus.COMPLETED == "completed"`` is True.
35 Lifecycle::
37 [*] ─┬─► QUEUED ─┬─► IN_PROGRESS ─┬─► COMPLETED
38 │ │ ├─► FAILED
39 │ └─► SUSPENDED └─► SUSPENDED
40 │ (concurrency limit) (terminated while queued)
41 │
42 └─► IN_PROGRESS (slots available, skips queue)
44 Notes:
45 - PENDING is declared as a model default but no creation path
46 actually sets it. All routes use QUEUED or IN_PROGRESS.
47 - ERROR is checked as a terminal state but never set by current
48 code. It predates FAILED and exists for backward compatibility
49 with older database records.
50 - CANCELLED is not used by the research workflow. It is used by
51 the benchmark subsystem (BenchmarkStatus, BenchmarkTaskStatus).
52 """
54 # --- Active lifecycle states ---
55 PENDING = "pending" # Model default; never set by any creation path
56 QUEUED = "queued" # Waiting for a worker slot
57 IN_PROGRESS = "in_progress" # Worker actively executing
59 # --- Terminal states ---
60 COMPLETED = "completed" # Finished successfully
61 SUSPENDED = "suspended" # User terminated the research
62 FAILED = "failed" # Unrecoverable error during execution
64 # --- Legacy / compatibility ---
65 ERROR = "error" # Never set; predates FAILED
66 CANCELLED = "cancelled" # Unused by research; for benchmarks
# --- Research library file_path sentinel values ---
# Marker strings used as sentinel values for a research library file_path.
FILE_PATH_METADATA_ONLY = "metadata_only"
FILE_PATH_TEXT_ONLY = "text_only_not_stored"
FILE_PATH_BLOB_DELETED = "blob_deleted"
# All of the sentinels above, gathered in one immutable tuple.
FILE_PATH_SENTINELS = (FILE_PATH_METADATA_ONLY, FILE_PATH_TEXT_ONLY, FILE_PATH_BLOB_DELETED)

# --- Snippet / truncation lengths ---
SNIPPET_LENGTH_SHORT = 250
SNIPPET_LENGTH_LONG = 500
# --- Research history collection ---
# Name and user-facing description of the research history collection.
RESEARCH_HISTORY_COLLECTION_NAME = "History"
RESEARCH_HISTORY_COLLECTION_DESCRIPTION = (
    "Your research history indexed for AI-powered semantic search. "
    "Indexing converts past research reports and their sources into searchable content, "
    "enabling natural-language queries across all your previous research. "
    "Used by the History page search when in AI or Hybrid mode."
)
# --- Available search strategies (UI-facing) ---
# The one authoritative list of strategies offered in every UI dropdown.
# create_strategy() in search_system_factory.py accepts further names
# (aliases, internal strategies); this list exists purely for the UI.
# Each row below is (name, label, description).
AVAILABLE_STRATEGIES: List[Dict[str, str]] = [
    {"name": name, "label": label, "description": description}
    for name, label, description in (
        (
            "source-based",
            "Source-Based (Best for small <16,000 context window)",
            "Comprehensive research with inline citations. Focuses on finding and extracting information from authoritative sources.",
        ),
        (
            "focused-iteration",
            "Focused Iteration - Quick (Minimal text output)",
            "Fast & precise Q&A with iterative search. Good for complex queries requiring specific answers.",
        ),
        (
            "focused-iteration-standard",
            "Focused Iteration - Comprehensive (Needs >16,000 context window)",
            "Detailed long-form output with citations. Uses standard citation handler for comprehensive answers.",
        ),
        (
            "mcp",
            "MCP ReAct (Agentic research - LLM decides tools)",
            "Agentic research using ReAct pattern. LLM decides what tools to call, analyzes results, and iterates.",
        ),
        (
            "langgraph-agent",
            "LangGraph Agent (Autonomous agentic research)",
            "Agentic research where the LLM autonomously decides what to search, which engines to use, and when to synthesize. Supports all search engines as tools.",
        ),
    )
]
# Every strategy entry: the UI-facing entries of AVAILABLE_STRATEGIES come
# first (the very same dict objects, not copies), followed by the additional
# strategies listed here. Each row below is (name, label, description).
ALL_STRATEGIES: List[Dict[str, str]] = AVAILABLE_STRATEGIES + [
    {"name": name, "label": label, "description": description}
    for name, label, description in (
        (
            "iterative-refinement",
            "Iterative Refinement (Progressive refinement)",
            "LLM-guided progressive refinement. Iteratively refines results using evaluation and follow-up queries.",
        ),
        (
            "topic-organization",
            "Topic Organization (Clusters by topic)",
            "Clusters sources into topics with lead texts. Organizes research by themes for structured output.",
        ),
        (
            "news_aggregation",
            "News Aggregation (Current events)",
            "Specialized for news aggregation and current events.",
        ),
        (
            "rapid",
            "Rapid (Quick single-pass search)",
            "Quick single-pass search for fast results. Good for simple queries.",
        ),
        (
            "iterative",
            "Iterative (Loop-based reasoning)",
            "Loop-based reasoning with persistent knowledge accumulation and confidence tracking.",
        ),
        (
            "parallel",
            "Parallel (Multiple queries simultaneously)",
            "Runs multiple search queries in parallel for comprehensive coverage.",
        ),
        (
            "recursive",
            "Recursive (Query decomposition)",
            "Recursive decomposition of complex queries into simpler sub-queries.",
        ),
        (
            "adaptive",
            "Adaptive (Step-by-step reasoning)",
            "Adaptive step-by-step reasoning that adjusts strategy based on results.",
        ),
        (
            "smart",
            "Smart (Auto sub-query generation)",
            "Smart decomposition with automatic sub-query generation.",
        ),
        (
            "standard",
            "Standard (Basic iterative search)",
            "Basic iterative search strategy for general use.",
        ),
        (
            "iterdrag",
            "IterDRAG (Iterative retrieval and generation)",
            "IterDRAG strategy for iterative document retrieval and generation.",
        ),
        (
            "iterative-reasoning",
            "Iterative Reasoning (Depth-based exploration)",
            "Iterative reasoning with depth-based exploration.",
        ),
        (
            "browsecomp",
            "BrowseComp (Confidence-based iteration)",
            "BrowseComp optimized strategy with confidence-based iteration.",
        ),
        (
            "evidence",
            "Evidence (Verification with candidate discovery)",
            "Enhanced evidence-based verification with candidate discovery and pattern learning.",
        ),
        (
            "constrained",
            "Constrained (Progressive narrowing)",
            "Progressive constraint-based search that narrows candidates step by step.",
        ),
        (
            "parallel-constrained",
            "Parallel Constrained (Combined constraint execution)",
            "Parallel constraint-based search with combined constraint execution.",
        ),
        (
            "early-stop-constrained",
            "Early Stop Constrained (With early stopping at 99%)",
            "Parallel constraint search with immediate evaluation and early stopping.",
        ),
        (
            "smart-query",
            "Smart Query (LLM query generation)",
            "Smart query generation strategy using LLM-generated queries.",
        ),
        (
            "dual-confidence",
            "Dual Confidence (Positive/negative/uncertainty scoring)",
            "Dual confidence scoring with positive/negative/uncertainty.",
        ),
        (
            "dual-confidence-with-rejection",
            "Dual Confidence + Rejection (Early rejection)",
            "Dual confidence with early rejection of poor candidates.",
        ),
        (
            "concurrent-dual-confidence",
            "Concurrent Dual Confidence (Concurrent search & evaluation)",
            "Concurrent search and evaluation with progressive constraint relaxation.",
        ),
        (
            "constraint-parallel",
            "Constraint Parallel (Parallel constraint checking)",
            "Parallel constraint checking with entity seeding and direct property search.",
        ),
        (
            "modular",
            "Modular (Modular architecture with constraint checking)",
            "Modular architecture using constraint checking and candidate exploration modules.",
        ),
        (
            "modular-parallel",
            "Modular Parallel (Modular with parallel exploration)",
            "Modular strategy with parallel exploration.",
        ),
        (
            "browsecomp-entity",
            "BrowseComp Entity (Entity-focused with knowledge graph)",
            "Entity-focused search for BrowseComp questions with knowledge graph building.",
        ),
    )
]
# --- Journal quality scoring thresholds ---
# Consumed by journal_quality.scoring.derive_quality_score and
# journal_quality.scoring.institution_score_from_h_index. Keeping them here
# as the single source of truth means the build phase, the runtime filter,
# and the dashboard all agree on what each h-index threshold means.
#
# Calibration points, taken from real OpenAlex data:
#   - Nature:   h-index ≈ 1,442
#   - PLOS ONE: h-index ≈ 467
#   - only ~3 journals globally have an h-index above 1,000
# The h-index carries field-dependent bias (math vs biomed); these cut-offs
# are general-purpose.

# Journal h-index thresholds → quality scores
JOURNAL_HINDEX_ELITE = 150  # Nature/Science/NEJM tier
JOURNAL_HINDEX_STRONG = 75
JOURNAL_HINDEX_VERY_GOOD = 40
JOURNAL_HINDEX_GOOD = 20
JOURNAL_HINDEX_ACCEPTABLE = 10
# Journal quality scores (1–10 scale)
JOURNAL_QUALITY_PREDATORY = 1
JOURNAL_QUALITY_DEFAULT = 4
JOURNAL_QUALITY_ACCEPTABLE = 5
JOURNAL_QUALITY_GOOD = 6
JOURNAL_QUALITY_VERY_GOOD = 7
JOURNAL_QUALITY_STRONG = 8
JOURNAL_QUALITY_ELITE = 10

# Exhaustive set of scores the scoring algorithm emits. The tiered scoring
# logic deliberately never produces 2, 3, or 9. An LLM output outside this
# set is rejected as a parse failure, so prompt drift shows up in the
# existing failure counter instead of being silently snapped to a valid
# value.
# INVARIANT: 9 is intentionally absent. Tier 4 LLM prompts never produce it,
# and the Tier 1-3 thresholds jump straight from 8 (h>=75) to 10 (h>=150).
# Do not "add it for completeness" — the defensive branch for 9 in
# search_utilities._format_quality_tag is currently dead by design.
VALID_QUALITY_SCORES = frozenset(
    (
        JOURNAL_QUALITY_PREDATORY,
        JOURNAL_QUALITY_DEFAULT,
        JOURNAL_QUALITY_ACCEPTABLE,
        JOURNAL_QUALITY_GOOD,
        JOURNAL_QUALITY_VERY_GOOD,
        JOURNAL_QUALITY_STRONG,
        JOURNAL_QUALITY_ELITE,
    )
)
# DOAJ scoring. The DOAJ Seal marks roughly the top 10% of DOAJ journals
# (best open-access practices).
DOAJ_QUALITY_WITH_SEAL = 8
DOAJ_QUALITY_NO_SEAL = 5
# Neutral score; in CS, top conferences are Q1-equivalent.
CONFERENCE_QUALITY_DEFAULT = 5
# Preprint repositories (arXiv, bioRxiv, SSRN, PsyArXiv, ...) are not
# peer-reviewed, so the venue itself carries no quality signal. Every
# repository-type source is capped at this score whatever its h-index,
# because that h-index is inflated by aggregating thousands of highly-cited
# papers (arXiv has h=674 thanks to its authors, not to venue rigor).
# The value matches the conference default: "acceptable, but the venue
# doesn't vouch for the paper". The filter's Tier 3.5 institution salvage
# can raise it to 6 when the authors are at a strong institution.
REPOSITORY_QUALITY_DEFAULT = 5
# Threshold for the predatory whitelist override. A flagged journal is
# rescued when it is listed in DOAJ (evidence-based) OR when its h-index is
# strictly greater than this value (a heuristic — the comparison is `>`,
# not `>=`).
#
# Do not re-tune this without literature support. The h-index measures
# impact, not integrity. Reviews of predatory-vs-legitimate classification
# (Blacklists and Whitelists to Tackle Predatory Publishing, mBio 2019, and
# the PMC2020 review that followed) treat DOAJ indexing + COPE / OASPA
# membership as the evidence-based whitelist — NOT any specific h-index
# boundary. Both the value 10 and the strict `>` are pragmatic defaults;
# adjusting them only shifts behavior at the boundary and has no published
# basis. For a real improvement, ADD more signals (JCR listing, OASPA
# membership) instead of tweaking this number.
# (Investigated in PR #3081, 2026-04.)
PREDATORY_WHITELIST_HINDEX = 10
# Institution h-index thresholds → quality scores. The result is capped at
# INSTITUTION_QUALITY_TOP, so institution salvage scoring never outranks a
# real venue match.
INSTITUTION_HINDEX_TOP = 250  # Top-tier research universities
INSTITUTION_HINDEX_HIGH = 50

INSTITUTION_QUALITY_TOP = 6
INSTITUTION_QUALITY_HIGH = 5
INSTITUTION_QUALITY_DEFAULT = 4
# --- API timeouts ---
# Timeout for the OpenAlex DOI→source_id batch enrichment. It is kept
# separate from the OpenAlex search engine timeout (which relies on the
# safe_requests default of 30s): batch metadata lookups are lightweight, and
# failing fast is preferable to blocking the pre-enrichment layer.
OPENALEX_ENRICHMENT_API_TIMEOUT = 15


# --- Journal-quality dataset download ---
# Free disk space that must be available before a bulk download starts. The
# five sources uncompress to an intermediate working set of ~1 GB in total;
# a 2 GB floor leaves headroom for the atomic temp file + compiled DB and
# still leaves room for the user's other work.
JOURNAL_QUALITY_MIN_FREE_DISK_BYTES = 2 * 1024 * 1024 * 1024
def get_available_strategies(show_all: bool = False) -> List[Dict[str, str]]:
    """Get the list of available research strategies.

    Args:
        show_all: If True, return every entry of ``ALL_STRATEGIES``
            (including the advanced/experimental ones); otherwise return
            only the entries of ``AVAILABLE_STRATEGIES``.

    Returns:
        A new list of new dictionaries, each with 'name', 'label', and
        'description' keys. Both the list and its dictionaries are
        independent copies, so callers may modify the result without
        altering the module-level strategy lists.
    """
    strategies = ALL_STRATEGIES if show_all else AVAILABLE_STRATEGIES
    # list.copy() alone is shallow: the returned list would still hold the
    # module-level dict objects, so a caller editing an entry (for example
    # tweaking a label) would silently change the shared constants for every
    # later caller. Copying each entry closes that hole; the values are plain
    # strings, so one level of copying is sufficient.
    return [dict(strategy) for strategy in strategies]