Coverage for src/local_deep_research/constants.py: 100%

53 statements  

« prev     ^ index     » next       coverage.py v7.14.1, created at 2026-06-03 23:15 +0000

1"""Project-wide constants for Local Deep Research.""" 

2 

3from enum import StrEnum 

4from typing import Dict, List 

5 

6from .__version__ import __version__ 

7 

# Honest, identifying User-Agent for APIs that prefer/require identification
# (e.g., academic APIs like arXiv, PubMed, OpenAlex).
# The package's ``__version__`` is interpolated into the string.
USER_AGENT = (
    f"Local-Deep-Research/{__version__} "
    "(Academic Research Tool; https://github.com/LearningCircuit/local-deep-research)"
)

# Browser-like User-Agent for sites that may block bot requests.
# Use sparingly and only when necessary.
# NOTE(review): the Chrome version is hard-coded in this string — confirm it
# is still recent enough for the sites that need it.
BROWSER_USER_AGENT = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/120.0.0.0 Safari/537.36"
)

22 

23 

24# --- Research status values --- 

25# Frontend helpers: src/local_deep_research/web/static/js/config/constants.js 

26# Injected via: src/local_deep_research/web/app_factory.py (inject_frontend_constants) 

27# Template: src/local_deep_research/web/templates/base.html 

28# If you add/remove/rename a status here, the frontend picks it up automatically. 

29class ResearchStatus(StrEnum): 

30 """Status values for research records. 

31 

32 Uses StrEnum so values compare equal to plain strings, 

33 e.g. ``ResearchStatus.COMPLETED == "completed"`` is True. 

34 

35 Lifecycle:: 

36 

37 [*] ─┬─► QUEUED ─┬─► IN_PROGRESS ─┬─► COMPLETED 

38 │ │ ├─► FAILED 

39 │ └─► SUSPENDED └─► SUSPENDED 

40 │ (concurrency limit) (terminated while queued) 

41 

42 └─► IN_PROGRESS (slots available, skips queue) 

43 

44 Notes: 

45 - PENDING is declared as a model default but no creation path 

46 actually sets it. All routes use QUEUED or IN_PROGRESS. 

47 - ERROR is checked as a terminal state but never set by current 

48 code. It predates FAILED and exists for backward compatibility 

49 with older database records. 

50 - CANCELLED is not used by the research workflow. It is used by 

51 the benchmark subsystem (BenchmarkStatus, BenchmarkTaskStatus). 

52 """ 

53 

54 # --- Active lifecycle states --- 

55 PENDING = "pending" # Model default; never set by any creation path 

56 QUEUED = "queued" # Waiting for a worker slot 

57 IN_PROGRESS = "in_progress" # Worker actively executing 

58 

59 # --- Terminal states --- 

60 COMPLETED = "completed" # Finished successfully 

61 SUSPENDED = "suspended" # User terminated the research 

62 FAILED = "failed" # Unrecoverable error during execution 

63 

64 # --- Legacy / compatibility --- 

65 ERROR = "error" # Never set; predates FAILED 

66 CANCELLED = "cancelled" # Unused by research; for benchmarks 

67 

68 

# --- Research library file_path sentinel values ---
# NOTE(review): presumably stored in a record's ``file_path`` field in place
# of a real path — confirm against the research library models.
FILE_PATH_METADATA_ONLY = "metadata_only"
FILE_PATH_TEXT_ONLY = "text_only_not_stored"
FILE_PATH_BLOB_DELETED = "blob_deleted"
# Every sentinel above, grouped in one tuple.
FILE_PATH_SENTINELS = (
    FILE_PATH_METADATA_ONLY,
    FILE_PATH_TEXT_ONLY,
    FILE_PATH_BLOB_DELETED,
)

# --- Snippet / truncation lengths ---
# NOTE(review): presumably character counts — confirm with the callers.
SNIPPET_LENGTH_SHORT = 250
SNIPPET_LENGTH_LONG = 500

# --- Research history collection ---
# Name and description text for the research history collection.
RESEARCH_HISTORY_COLLECTION_NAME = "History"
RESEARCH_HISTORY_COLLECTION_DESCRIPTION = (
    "Your research history indexed for AI-powered semantic search. "
    "Indexing converts past research reports and their sources into "
    "searchable content, enabling natural-language queries across all "
    "your previous research. Used by the History page search when in "
    "AI or Hybrid mode."
)

92 

# --- Available search strategies (UI-facing) ---
# Single source of truth for strategies shown in all UI dropdowns.
# create_strategy() in search_system_factory.py handles additional names
# (aliases, internal strategies) — this list is purely for the UI.
# Each entry is a dict with "name", "label" and "description" string keys.
AVAILABLE_STRATEGIES: List[Dict[str, str]] = [
    {
        "name": "source-based",
        "label": "Source-Based (Best for small <16,000 context window)",
        "description": "Comprehensive research with inline citations. Focuses on finding and extracting information from authoritative sources.",
    },
    {
        "name": "focused-iteration",
        "label": "Focused Iteration - Quick (Minimal text output)",
        "description": "Fast & precise Q&A with iterative search. Good for complex queries requiring specific answers.",
    },
    {
        "name": "focused-iteration-standard",
        "label": "Focused Iteration - Comprehensive (Needs >16,000 context window)",
        "description": "Detailed long-form output with citations. Uses standard citation handler for comprehensive answers.",
    },
    {
        "name": "mcp",
        "label": "MCP ReAct (Agentic research - LLM decides tools)",
        "description": "Agentic research using ReAct pattern. LLM decides what tools to call, analyzes results, and iterates.",
    },
    {
        "name": "langgraph-agent",
        "label": "LangGraph Agent (Autonomous agentic research)",
        "description": "Agentic research where the LLM autonomously decides what to search, which engines to use, and when to synthesize. Supports all search engines as tools.",
    },
]

124 

125 

# Full strategy catalogue: the UI-facing AVAILABLE_STRATEGIES entries first
# (unpacked in place, so the very same dict objects appear in both lists),
# followed by the additional strategies that are only listed when
# get_available_strategies(show_all=True) is requested.
# Each entry is a dict with "name", "label" and "description" string keys.
ALL_STRATEGIES: List[Dict[str, str]] = [
    *AVAILABLE_STRATEGIES,
    {
        "name": "iterative-refinement",
        "label": "Iterative Refinement (Progressive refinement)",
        "description": "LLM-guided progressive refinement. Iteratively refines results using evaluation and follow-up queries.",
    },
    {
        "name": "topic-organization",
        "label": "Topic Organization (Clusters by topic)",
        "description": "Clusters sources into topics with lead texts. Organizes research by themes for structured output.",
    },
    {
        # NOTE(review): this is the only name written with an underscore;
        # every other entry is hyphenated. Confirm it matches the name
        # create_strategy() expects before normalizing it.
        "name": "news_aggregation",
        "label": "News Aggregation (Current events)",
        "description": "Specialized for news aggregation and current events.",
    },
    {
        "name": "rapid",
        "label": "Rapid (Quick single-pass search)",
        "description": "Quick single-pass search for fast results. Good for simple queries.",
    },
    {
        "name": "iterative",
        "label": "Iterative (Loop-based reasoning)",
        "description": "Loop-based reasoning with persistent knowledge accumulation and confidence tracking.",
    },
    {
        "name": "parallel",
        "label": "Parallel (Multiple queries simultaneously)",
        "description": "Runs multiple search queries in parallel for comprehensive coverage.",
    },
    {
        "name": "recursive",
        "label": "Recursive (Query decomposition)",
        "description": "Recursive decomposition of complex queries into simpler sub-queries.",
    },
    {
        "name": "adaptive",
        "label": "Adaptive (Step-by-step reasoning)",
        "description": "Adaptive step-by-step reasoning that adjusts strategy based on results.",
    },
    {
        "name": "smart",
        "label": "Smart (Auto sub-query generation)",
        "description": "Smart decomposition with automatic sub-query generation.",
    },
    {
        "name": "standard",
        "label": "Standard (Basic iterative search)",
        "description": "Basic iterative search strategy for general use.",
    },
    {
        "name": "iterdrag",
        "label": "IterDRAG (Iterative retrieval and generation)",
        "description": "IterDRAG strategy for iterative document retrieval and generation.",
    },
    {
        "name": "iterative-reasoning",
        "label": "Iterative Reasoning (Depth-based exploration)",
        "description": "Iterative reasoning with depth-based exploration.",
    },
    {
        "name": "browsecomp",
        "label": "BrowseComp (Confidence-based iteration)",
        "description": "BrowseComp optimized strategy with confidence-based iteration.",
    },
    {
        "name": "evidence",
        "label": "Evidence (Verification with candidate discovery)",
        "description": "Enhanced evidence-based verification with candidate discovery and pattern learning.",
    },
    {
        "name": "constrained",
        "label": "Constrained (Progressive narrowing)",
        "description": "Progressive constraint-based search that narrows candidates step by step.",
    },
    {
        "name": "parallel-constrained",
        "label": "Parallel Constrained (Combined constraint execution)",
        "description": "Parallel constraint-based search with combined constraint execution.",
    },
    {
        "name": "early-stop-constrained",
        "label": "Early Stop Constrained (With early stopping at 99%)",
        "description": "Parallel constraint search with immediate evaluation and early stopping.",
    },
    {
        "name": "smart-query",
        "label": "Smart Query (LLM query generation)",
        "description": "Smart query generation strategy using LLM-generated queries.",
    },
    {
        "name": "dual-confidence",
        "label": "Dual Confidence (Positive/negative/uncertainty scoring)",
        "description": "Dual confidence scoring with positive/negative/uncertainty.",
    },
    {
        "name": "dual-confidence-with-rejection",
        "label": "Dual Confidence + Rejection (Early rejection)",
        "description": "Dual confidence with early rejection of poor candidates.",
    },
    {
        "name": "concurrent-dual-confidence",
        "label": "Concurrent Dual Confidence (Concurrent search & evaluation)",
        "description": "Concurrent search and evaluation with progressive constraint relaxation.",
    },
    {
        "name": "constraint-parallel",
        "label": "Constraint Parallel (Parallel constraint checking)",
        "description": "Parallel constraint checking with entity seeding and direct property search.",
    },
    {
        "name": "modular",
        "label": "Modular (Modular architecture with constraint checking)",
        "description": "Modular architecture using constraint checking and candidate exploration modules.",
    },
    {
        "name": "modular-parallel",
        "label": "Modular Parallel (Modular with parallel exploration)",
        "description": "Modular strategy with parallel exploration.",
    },
    {
        "name": "browsecomp-entity",
        "label": "BrowseComp Entity (Entity-focused with knowledge graph)",
        "description": "Entity-focused search for BrowseComp questions with knowledge graph building.",
    },
]

254 

255 

256# --- Journal quality scoring thresholds --- 

257# Used by journal_quality.scoring.derive_quality_score and 

258# journal_quality.scoring.institution_score_from_h_index. Single source of 

259# truth so the build phase, the runtime filter, and the dashboard agree on 

260# what each h-index threshold means. 

261# 

262# Thresholds calibrated from real OpenAlex data: 

263# - Nature h-index ≈ 1,442 

264# - PLOS ONE h-index ≈ 467 

265# - Only ~3 journals globally have h-index > 1,000 

266# h-index has field-dependent bias (math vs biomed); these are general-purpose. 

267 

# Journal h-index thresholds → quality scores.
# Each JOURNAL_HINDEX_<TIER> shares its suffix with a JOURNAL_QUALITY_<TIER>
# score below (e.g. STRONG: h>=75 → 8, ELITE: h>=150 → 10, as stated in the
# invariant note on VALID_QUALITY_SCORES).
JOURNAL_HINDEX_ELITE = 150  # Nature/Science/NEJM tier
JOURNAL_HINDEX_STRONG = 75
JOURNAL_HINDEX_VERY_GOOD = 40
JOURNAL_HINDEX_GOOD = 20
JOURNAL_HINDEX_ACCEPTABLE = 10

# Journal quality scores (1–10 scale).
# The scale is intentionally sparse: 2, 3 and 9 are not assigned.
JOURNAL_QUALITY_PREDATORY = 1
JOURNAL_QUALITY_DEFAULT = 4
JOURNAL_QUALITY_ACCEPTABLE = 5
JOURNAL_QUALITY_GOOD = 6
JOURNAL_QUALITY_VERY_GOOD = 7
JOURNAL_QUALITY_STRONG = 8
JOURNAL_QUALITY_ELITE = 10

# The complete set of scores the scoring algorithm emits. Scores 2, 3, 9
# are deliberately never produced by the tiered scoring logic; LLM outputs
# outside this set are rejected as parse failures so prompt drift surfaces
# via the existing failure counter rather than silently snapping.
# INVARIANT: score 9 is intentionally NOT in this set. Tier 4 LLM prompts
# never produce it and Tier 1-3 thresholds skip directly from 8 (h>=75)
# to 10 (h>=150). Do not "add it for completeness" — downstream code in
# search_utilities._format_quality_tag has a defensive branch for 9 that
# is currently dead by design.
# Built from the named constants so the set cannot drift from the scale;
# frozenset keeps it immutable.
VALID_QUALITY_SCORES = frozenset(
    {
        JOURNAL_QUALITY_PREDATORY,
        JOURNAL_QUALITY_DEFAULT,
        JOURNAL_QUALITY_ACCEPTABLE,
        JOURNAL_QUALITY_GOOD,
        JOURNAL_QUALITY_VERY_GOOD,
        JOURNAL_QUALITY_STRONG,
        JOURNAL_QUALITY_ELITE,
    }
)

304 

# DOAJ scoring (DOAJ Seal = top ~10% of DOAJ journals, best OA practices).
DOAJ_QUALITY_WITH_SEAL = 8
DOAJ_QUALITY_NO_SEAL = 5
CONFERENCE_QUALITY_DEFAULT = (
    5  # Neutral; in CS top conferences are Q1-equivalent
)
# Preprint repositories (arXiv, bioRxiv, SSRN, PsyArXiv, ...) are not
# peer-reviewed — the venue itself carries no quality signal. Cap all
# repository-type sources at this score regardless of their h-index,
# which is inflated by aggregating thousands of highly-cited papers
# (arXiv has h=674 because of its authors, not because of venue rigor).
# Matches the conference default: "acceptable, but the venue doesn't
# vouch for the paper". The filter's Tier 3.5 institution salvage can
# lift this to 6 when the authors are at a strong institution.
REPOSITORY_QUALITY_DEFAULT = 5

# Predatory whitelist override threshold. A flagged journal is rescued
# if it's in DOAJ (evidence-based) OR has h-index strictly greater than
# this value (heuristic — `>`, not `>=`).
#
# Do not re-tune without literature support. The h-index is an impact
# metric, not an integrity signal. Reviews of predatory-vs-legitimate
# classification (Blacklists and Whitelists to Tackle Predatory
# Publishing, mBio 2019, and the PMC2020 review that followed) treat
# DOAJ indexing + COPE / OASPA membership as the evidence-based
# whitelist — NOT any specific h-index boundary. The value 10 and
# strict-> here are pragmatic defaults; tuning them only changes
# behavior at the boundary and has no published basis. If you want
# real improvement, ADD more signals (JCR listing, OASPA membership)
# rather than tweaking this number. (Investigated in PR #3081, 2026-04.)
PREDATORY_WHITELIST_HINDEX = 10

# Institution h-index thresholds → quality scores. Capped at
# INSTITUTION_QUALITY_TOP — institution salvage scoring never beats a real
# venue match.
# INSTITUTION_QUALITY_TOP (6) equals the value the repository note above
# says institution salvage can lift a preprint to.
INSTITUTION_HINDEX_TOP = 250  # Top-tier research universities
INSTITUTION_HINDEX_HIGH = 50
INSTITUTION_QUALITY_TOP = 6
INSTITUTION_QUALITY_HIGH = 5
INSTITUTION_QUALITY_DEFAULT = 4

345 

346 

# --- API timeouts ---
# OpenAlex DOI→source_id batch enrichment. Distinct from the OpenAlex search
# engine timeout (which uses the safe_requests default of 30s) because batch
# metadata lookups are lightweight and we'd rather fail fast than block the
# pre-enrichment layer.
# NOTE(review): presumably seconds, like the 30s default mentioned above —
# confirm at the call site.
OPENALEX_ENRICHMENT_API_TIMEOUT = 15


# --- Journal-quality dataset download ---
# Minimum free disk space required before starting a bulk download. The
# five sources uncompress to ~1 GB total intermediate working set; the 2
# GB floor gives headroom for the atomic temp file + compiled DB while
# leaving room for the user's other work.
# 2 * 1024**3 = 2,147,483,648 bytes (2 GiB).
JOURNAL_QUALITY_MIN_FREE_DISK_BYTES = 2 * 1024**3

361 

362 

def get_available_strategies(show_all: bool = False) -> List[Dict[str, str]]:
    """Get the list of available research strategies.

    The result is a new list holding a new dictionary per strategy, so
    callers may freely mutate what they receive without altering the
    module-level ``AVAILABLE_STRATEGIES`` / ``ALL_STRATEGIES`` tables.

    Args:
        show_all: If True, return all strategies including advanced/experimental ones.
            If False (the default), return only the UI-facing strategies.

    Returns:
        List of dictionaries with 'name', 'label', and 'description' keys.
    """
    strategies = ALL_STRATEGIES if show_all else AVAILABLE_STRATEGIES
    # list.copy() would be shallow: the returned list would still hold the
    # very dict objects stored in the module constants (and ALL_STRATEGIES
    # shares its first entries with AVAILABLE_STRATEGIES), so editing one
    # returned entry would corrupt both tables. Copy each entry instead;
    # the values are plain strings, so a one-level copy is sufficient.
    return [dict(strategy) for strategy in strategies]

374 return AVAILABLE_STRATEGIES.copy()