Coverage for src/local_deep_research/constants.py: 100%

1"""Project-wide constants for Local Deep Research."""

import json
from enum import StrEnum
from typing import Dict, List

from .__version__ import __version__

# Honest User-Agent that identifies this project, for APIs that prefer or
# require clients to identify themselves (e.g., academic APIs such as
# arXiv, PubMed, OpenAlex).
USER_AGENT = "".join(
    (
        f"Local-Deep-Research/{__version__} ",
        "(Academic Research Tool; https://github.com/LearningCircuit/local-deep-research)",
    )
)

# User-Agent that looks like a regular browser, for sites that may block
# requests coming from bots. Use it sparingly and only when necessary.
BROWSER_USER_AGENT = " ".join(
    (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
        "AppleWebKit/537.36 (KHTML, like Gecko)",
        "Chrome/120.0.0.0 Safari/537.36",
    )
)

# Default search engine: the single code-side source of truth. Every reader
# that needs a fallback when the ``search.tool`` setting is MISSING (partial
# snapshots from the programmatic API, un-bootstrapped settings DBs) should
# import THIS rather than hardcode the string literal. Scattered literals
# are how the old ``"auto"`` default lingered across ~30 sites and had to be
# hunted down one at a time when the meta engines were removed.
# The value must match the registered default in
# defaults/default_settings.json, which is pinned by
# tests/test_constants.py::test_default_search_tool_matches_registry.
# Follows the DEFAULT_EGRESS_SCOPE pattern in security/egress/policy.py.
DEFAULT_SEARCH_TOOL: str = "searxng"

36# --- Research status values ---

37# Frontend helpers: src/local_deep_research/web/static/js/config/constants.js

38# Injected via: src/local_deep_research/web/app_factory.py (inject_frontend_constants)

39# Template: src/local_deep_research/web/templates/base.html

40# If you add/remove/rename a status here, the frontend picks it up automatically.

41class ResearchStatus(StrEnum):

42 """Status values for research records.

44 Uses StrEnum so values compare equal to plain strings,

45 e.g. ``ResearchStatus.COMPLETED == "completed"`` is True.

47 Lifecycle::

49 [*] ─┬─► QUEUED ─┬─► IN_PROGRESS ─┬─► COMPLETED

50 │ │ ├─► FAILED

51 │ └─► SUSPENDED └─► SUSPENDED

52 │ (concurrency limit) (terminated while queued)

53 │

54 └─► IN_PROGRESS (slots available, skips queue)

56 Notes:

57 - PENDING is declared as a model default but no creation path

58 actually sets it. All routes use QUEUED or IN_PROGRESS.

59 - ERROR is checked as a terminal state but never set by current

60 code. It predates FAILED and exists for backward compatibility

61 with older database records.

62 - CANCELLED is not used by the research workflow. It is used by

63 the benchmark subsystem (BenchmarkStatus, BenchmarkTaskStatus).

64 """

66 # --- Active lifecycle states ---

67 PENDING = "pending" # Model default; never set by any creation path

68 QUEUED = "queued" # Waiting for a worker slot

69 IN_PROGRESS = "in_progress" # Worker actively executing

71 # --- Terminal states ---

72 COMPLETED = "completed" # Finished successfully

73 SUSPENDED = "suspended" # User terminated the research

74 FAILED = "failed" # Unrecoverable error during execution

76 # --- Legacy / compatibility ---

77 ERROR = "error" # Never set; predates FAILED

78 CANCELLED = "cancelled" # Unused by research; for benchmarks

# --- Research library file_path sentinel values ---
FILE_PATH_METADATA_ONLY = "metadata_only"
FILE_PATH_TEXT_ONLY = "text_only_not_stored"
FILE_PATH_BLOB_DELETED = "blob_deleted"
# All sentinel values above, gathered in declaration order.
FILE_PATH_SENTINELS = tuple(
    [FILE_PATH_METADATA_ONLY, FILE_PATH_TEXT_ONLY, FILE_PATH_BLOB_DELETED]
)

# --- Default RAG / Local Search text separators ---
DEFAULT_LOCAL_SEARCH_TEXT_SEPARATORS: List[str] = ["\n\n", "\n", ". ", " ", ""]
# JSON-encoded form of the list above. It is derived with json.dumps rather
# than typed out a second time, so the two constants cannot drift apart.
# With the default encoder settings the result is exactly
# '["\\n\\n", "\\n", ". ", " ", ""]'.
DEFAULT_LOCAL_SEARCH_TEXT_SEPARATORS_JSON: str = json.dumps(
    DEFAULT_LOCAL_SEARCH_TEXT_SEPARATORS
)

# --- Snippet / truncation lengths ---
SNIPPET_LENGTH_SHORT = 250
SNIPPET_LENGTH_LONG = 500

# --- /history/logs/<id> pagination caps ---
# The default equals the frontend logpanel DOM cap (MAX_LOG_ENTRIES). The
# hard cap is the ceiling the route clamps to, so that a client cannot
# force an unbounded load. Both are shared with the frontend through
# inject_frontend_constants (see web/app_factory.py), which exposes them as
# window.LDR_LOG_LIMITS (see base.html).
HISTORY_LOGS_DEFAULT_LIMIT = 500
HISTORY_LOGS_HARD_CAP = 5_000

107

# --- Research history collection ---
RESEARCH_HISTORY_COLLECTION_NAME = "History"
# Description text, assembled one sentence per entry and joined by a single
# space.
RESEARCH_HISTORY_COLLECTION_DESCRIPTION = " ".join(
    (
        "Your research history indexed for AI-powered semantic search.",
        (
            "Indexing converts past research reports and their sources "
            "into searchable content, enabling natural-language queries "
            "across all your previous research."
        ),
        "Used by the History page search when in AI or Hybrid mode.",
    )
)

117

# --- Available search strategies (UI-facing) ---
# The one list every UI dropdown takes its strategies from.
# create_strategy() in search_system_factory.py accepts further names
# (aliases, internal strategies such as "news_aggregation"); this list
# exists purely for the UI.
AVAILABLE_STRATEGIES: List[Dict[str, str]] = [
    dict(
        name="source-based",
        label="Source-Based (Best for small <16,000 context window)",
        description="Comprehensive research with inline citations. Focuses on finding and extracting information from authoritative sources.",
    ),
    dict(
        name="focused-iteration",
        label="Focused Iteration - Quick (Minimal text output)",
        description="Fast & precise Q&A with iterative search. Good for complex queries requiring specific answers.",
    ),
    dict(
        name="focused-iteration-standard",
        label="Focused Iteration - Comprehensive (Needs >16,000 context window)",
        description="Detailed long-form output with citations. Uses standard citation handler for comprehensive answers.",
    ),
    dict(
        name="topic-organization",
        label="Topic Organization (Clusters by topic)",
        description="Clusters sources into topics with lead texts. Organizes research by themes for structured output.",
    ),
    dict(
        name="langgraph-agent",
        label="LangGraph Agent (Autonomous agentic research)",
        description="Agentic research where the LLM autonomously decides what to search, which engines to use, and when to synthesize. Supports all search engines as tools.",
    ),
]

150

151

# --- Journal quality scoring thresholds ---
# Consumed by journal_quality.scoring.derive_quality_score and
# journal_quality.scoring.institution_score_from_h_index. They live in one
# place so the build phase, the runtime filter, and the dashboard agree on
# what each h-index threshold means.
#
# Thresholds were calibrated from real OpenAlex data:
#   - Nature h-index ≈ 1,442
#   - PLOS ONE h-index ≈ 467
#   - Only ~3 journals globally have h-index > 1,000
# h-index carries field-dependent bias (math vs biomed); these values are
# general-purpose.

# Journal h-index thresholds → quality scores
JOURNAL_HINDEX_ELITE = 150  # Nature/Science/NEJM tier
JOURNAL_HINDEX_STRONG = 75
JOURNAL_HINDEX_VERY_GOOD = 40
JOURNAL_HINDEX_GOOD = 20
JOURNAL_HINDEX_ACCEPTABLE = 10

# Journal quality scores (1–10 scale)
JOURNAL_QUALITY_PREDATORY = 1
JOURNAL_QUALITY_DEFAULT = 4
JOURNAL_QUALITY_ACCEPTABLE = 5
JOURNAL_QUALITY_GOOD = 6
JOURNAL_QUALITY_VERY_GOOD = 7
JOURNAL_QUALITY_STRONG = 8
JOURNAL_QUALITY_ELITE = 10

# Every score the scoring algorithm can emit. The tiered scoring logic
# deliberately never produces 2, 3 or 9; LLM outputs outside this set are
# rejected as parse failures, so prompt drift shows up through the existing
# failure counter instead of being silently snapped to a valid value.
# INVARIANT: score 9 is intentionally NOT in this set. Tier 4 LLM prompts
# never produce it, and the Tier 1-3 thresholds jump straight from 8
# (h>=75) to 10 (h>=150). Do not "add it for completeness" — downstream
# code in search_utilities._format_quality_tag has a defensive branch for 9
# that is currently dead by design.
VALID_QUALITY_SCORES = frozenset(
    (
        JOURNAL_QUALITY_PREDATORY,
        JOURNAL_QUALITY_DEFAULT,
        JOURNAL_QUALITY_ACCEPTABLE,
        JOURNAL_QUALITY_GOOD,
        JOURNAL_QUALITY_VERY_GOOD,
        JOURNAL_QUALITY_STRONG,
        JOURNAL_QUALITY_ELITE,
    )
)

200

# DOAJ scoring. A higher DOAJ_QUALITY_WITH_SEAL = 8 tier used to exist, but
# DOAJ retired the Seal in April 2025 and removed it from their metadata,
# which leaves being listed as the only DOAJ signal:
# https://blog.doaj.org/2025/04/09/our-metadata-changes-are-live-and-the-seal-has-been-retired/
DOAJ_QUALITY_LISTED = 5
# Neutral; in CS top conferences are Q1-equivalent
CONFERENCE_QUALITY_DEFAULT = 5
# Preprint repositories (arXiv, bioRxiv, SSRN, PsyArXiv, ...) are not
# peer-reviewed, so the venue itself carries no quality signal. Every
# repository-type source is capped at this score regardless of its h-index,
# which is inflated by aggregating thousands of highly-cited papers (arXiv
# has h=674 because of its authors, not because of venue rigor). The value
# matches the conference default: "acceptable, but the venue doesn't vouch
# for the paper". The filter's Tier 3.5 institution salvage can lift this
# to 6 when the authors are at a strong institution.
REPOSITORY_QUALITY_DEFAULT = 5

218

# Predatory whitelist override threshold. A flagged journal is rescued when
# it is in DOAJ (evidence-based) OR when its h-index is strictly greater
# than this value (heuristic — `>`, not `>=`).
#
# Do not re-tune without literature support. The h-index is an impact
# metric, not an integrity signal. Reviews of predatory-vs-legitimate
# classification (Blacklists and Whitelists to Tackle Predatory
# Publishing, mBio 2019, and the PMC2020 review that followed) treat
# DOAJ indexing + COPE / OASPA membership as the evidence-based
# whitelist — NOT any specific h-index boundary. Both the value 10 and the
# strict `>` comparison are pragmatic defaults; tuning them only changes
# behavior at the boundary and has no published basis. For a real
# improvement, ADD more signals (JCR listing, OASPA membership) instead of
# tweaking this number. (Investigated in PR #3081, 2026-04.)
PREDATORY_WHITELIST_HINDEX = 10

234

# Institution h-index thresholds → quality scores. The scores are capped at
# INSTITUTION_QUALITY_TOP, so institution salvage scoring never beats a
# real venue match.
INSTITUTION_HINDEX_TOP = 250  # Top-tier research universities
INSTITUTION_HINDEX_HIGH = 50

INSTITUTION_QUALITY_TOP = 6
INSTITUTION_QUALITY_HIGH = 5
INSTITUTION_QUALITY_DEFAULT = 4

243

244

# --- API timeouts ---
# Timeout for the OpenAlex DOI→source_id batch enrichment. It is kept
# separate from the OpenAlex search engine timeout (which uses the
# safe_requests default of 30s): batch metadata lookups are lightweight,
# and failing fast is preferable to blocking the pre-enrichment layer.
OPENALEX_ENRICHMENT_API_TIMEOUT = 15

251

252

# --- Journal-quality dataset download ---
# Free disk space that must be available before a bulk download starts.
# The five sources uncompress to ~1 GB total intermediate working set; a
# 2 GB floor leaves headroom for the atomic temp file + compiled DB while
# still leaving room for the user's other work.
JOURNAL_QUALITY_MIN_FREE_DISK_BYTES = 2 * (1 << 30)

259

260

def get_available_strategies() -> List[Dict[str, str]]:
    """Get the list of available research strategies shown in the UI.

    Every call builds a new list holding new dictionaries, so callers may
    add, remove or edit entries without changing AVAILABLE_STRATEGIES.

    Returns:
        List of dictionaries with 'name', 'label', and 'description' keys.
    """
    # list.copy() alone is shallow: the returned list would still share its
    # dictionaries with the module constant, and editing one entry would
    # leak into every later caller. Copy each entry as well.
    return [dict(strategy) for strategy in AVAILABLE_STRATEGIES]