Coverage for src/local_deep_research/web/services/report_assembly_service.py: 95%
84 statements
« prev ^ index » next coverage.py v7.14.1, created at 2026-06-03 23:15 +0000
« prev ^ index » next coverage.py v7.14.1, created at 2026-06-03 23:15 +0000
1"""Reconstruct the legacy "full report" view from structured storage.
3`research.report_content` stores only the synthesized answer (with inline
4`[N](url)` hyperlinks). Sources live in the `research_resources` table;
5metrics live in `research.research_meta`. This module rebuilds the
6combined `answer + ## Sources + ## Research Metrics` view on demand for
7display and export — chat reads `research.report_content` directly and
8never goes through this module.
10DO NOT write the assembled output back to `research.report_content` —
11that column must stay answer-only. Writing assembled output back would
12silently re-introduce the regex over-strip class of bugs that
13answer-only storage avoids.
14"""
16import re
17from typing import Any, Dict, List, Optional
19from loguru import logger
20from sqlalchemy.orm import Session
22from ...database.models.research import ResearchHistory, ResearchResource
23from ...utilities.search_utilities import format_links_to_markdown
# Heading detectors for the legacy-row guard in `assemble_full_report`.
# Both patterns are anchored to the start of a line (MULTILINE), because a
# plain substring test such as `"## Sources" in body` would also fire on
# prose that merely mentions the heading in the middle of a line.
_LEGACY_SOURCES_RE = re.compile(r"^## Sources\b", flags=re.MULTILINE)
_LEGACY_METRICS_RE = re.compile(r"^## Research Metrics\b", flags=re.MULTILINE)
def assemble_full_report(
    research: Optional[ResearchHistory], db_session: Session
) -> Optional[str]:
    """Build the combined answer / sources / metrics markdown for a research.

    Args:
        research: The ResearchHistory ORM row, or ``None``. It must have
            been loaded inside ``db_session`` so that lazily reading
            ``research.research_meta`` does not raise
            DetachedInstanceError.
        db_session: Active SQLAlchemy session bound to the user DB; it is
            handed to the sources builder to query ``research_resources``.

    Returns:
        ``None`` when ``research`` is ``None`` (the caller should map this
        to a 404). Otherwise the stored answer, followed by a
        ``## Sources`` block and a ``## Research Metrics`` block whenever
        those have content and the stored answer does not already include
        them. Sections are joined with a blank line.
    """
    if research is None:
        return None

    answer = research.report_content or ""
    sections = [answer]

    # Legacy-row guard: older rows already carry their own `## Sources` /
    # `## Research Metrics` headings inside report_content, and appending
    # fresh sections would render them twice. The patterns only match at
    # the start of a line so an answer that merely quotes the heading
    # inline is not mistaken for a legacy row.
    if _LEGACY_SOURCES_RE.search(answer) is None:
        # No try/except on purpose: callers turn a failure into HTTP 500.
        # Swallowing it here would produce a report that looks complete
        # while silently missing every source.
        sources_block = _build_sources_markdown(research, db_session)
        if sources_block:
            sections.append("## Sources\n\n" + sources_block)

    if _LEGACY_METRICS_RE.search(answer) is None:
        metrics_block = _build_metrics_markdown(research)
        if metrics_block:
            sections.append("## Research Metrics\n" + metrics_block)

    return "\n\n".join(sections)
def _build_metrics_markdown(research: ResearchHistory) -> str:
    """Render the bullet lines of the Research Metrics block.

    Values are read from ``research.research_meta``: ``"iterations"`` for
    the search-iteration count and ``"generated_at"`` for the timestamp.
    When ``generated_at`` is missing or empty, ``research.completed_at``
    is used instead (legacy rows or scheduler-saved research).

    Returns:
        The bullet lines joined by newlines, without a heading, or an
        empty string when neither value is available.
    """
    stored = research.research_meta or {}
    count = stored.get("iterations")
    stamp = stored.get("generated_at") or research.completed_at

    # ``is not None`` keeps a legitimate count of 0 in the output.
    iterations_line = (
        [f"- Search Iterations: {count}"] if count is not None else []
    )
    timestamp_line = [f"- Generated at: {stamp}"] if stamp else []
    return "\n".join(iterations_line + timestamp_line)
def _build_sources_markdown(
    research: ResearchHistory, db_session: Session
) -> str:
    """Render the Sources block body from ``research_resources`` rows.

    Every ResearchResource row of the research (ordered by id) is turned
    into the dict shape passed to ``format_links_to_markdown``. The
    citation number is taken from
    ``resource_metadata['original_data']['index']`` when present; this is
    the index assigned at search time and the one the inline ``[N]``
    references in the saved answer point to. Rows that lost it fall back
    to their 1-based position in the result set.
    """
    rows = (
        db_session.query(ResearchResource)
        .filter_by(research_id=research.id)
        .order_by(ResearchResource.id.asc())
        .all()
    )

    links: List[Dict[str, Any]] = []
    fallback_used = 0
    for position, row in enumerate(rows, start=1):
        # Defensive: legacy rows may hold the metadata as a string, so
        # anything that is not a dict is treated as empty.
        raw_meta = row.resource_metadata
        if not isinstance(raw_meta, dict):
            raw_meta = {}
        source_data = raw_meta.get("original_data")
        if not isinstance(source_data, dict):
            source_data = {}

        citation = source_data.get("index")
        # Explicit None / "" test rather than truthiness, so an index of
        # 0 still counts as present.
        if citation is None or citation == "":
            fallback_used += 1
            citation = str(position)

        links.append(
            {
                "url": str(row.url) if row.url else "",
                "title": str(row.title) if row.title else "Untitled",
                "index": citation,
                "journal_quality": source_data.get("journal_quality"),
            }
        )

    if fallback_used:
        # DEBUG rather than WARNING: this is expected for legacy rows and
        # for URL-less entries skipped at save time, and the row-order
        # fallback keeps the rendering correct. research_id is bound so
        # the message routes through the per-research log table.
        logger.bind(research_id=research.id).debug(
            "_build_sources_markdown: {} of {} rows missing original "
            "citation index; using row order. Common cause: URL-less "
            "entries were skipped at save time.",
            fallback_used,
            len(rows),
        )

    return format_links_to_markdown(links)
def _feed_link(
    raw_url: Optional[str], raw_title: Optional[str]
) -> Optional[Dict[str, str]]:
    """Convert one resource's url/title into a news-feed link dict.

    Returns ``None`` when the stripped URL does not start with ``http``.
    A missing or blank title falls back to the URL's host without a
    leading ``www.``. Titles longer than 50 characters are cut to 50 and
    suffixed with ``...``.
    """
    url = (raw_url or "").strip()
    if not url.startswith("http"):
        return None

    title = (raw_title or "").strip()
    if not title:
        # maxsplit=1 so only the scheme separator is split on: a later
        # "//" (e.g. a redirect target embedded in the query string)
        # must not be mistaken for the start of the host.
        host = url.split("//", 1)[-1].split("/", 1)[0]
        # Drop only a leading "www."; removing it anywhere would mangle
        # hosts such as "awww.example.com".
        title = host[4:] if host.startswith("www.") else host
    if len(title) > 50:
        title = title[:50] + "..."
    return {"url": url, "title": title}


def get_research_source_links(
    research_id: str, db_session: Session, limit: int = 3
) -> List[Dict[str, str]]:
    """Top-N source links for a research, in row-insertion order.

    Returns dicts shaped ``{"url": str, "title": str}`` matching the
    news feed's ``links`` contract (``news/api.py`` consumers). Titles
    fall back to the URL's domain when missing and are truncated to 50
    chars to match the existing list-card rendering in the news UI.

    The cap is applied after rows with a non-``http`` URL are discarded,
    the same way :func:`get_research_source_links_batch` does it, so such
    rows do not use up slots that later valid links could fill.

    Args:
        research_id: The ResearchHistory id.
        db_session: Active SQLAlchemy session bound to the user DB.
        limit: Maximum number of links to return. ``None`` returns every
            usable link.
    """
    rows = (
        db_session.query(ResearchResource)
        .filter_by(research_id=research_id)
        .filter(ResearchResource.url.isnot(None))
        .order_by(ResearchResource.id.asc())
        .all()
    )
    links: List[Dict[str, str]] = []
    for row in rows:
        if limit is not None and len(links) >= limit:
            break
        link = _feed_link(row.url, row.title)
        if link is not None:
            links.append(link)
    return links


def get_research_source_links_batch(
    research_ids: List[str], db_session: Session, limit: Optional[int] = 3
) -> Dict[str, List[Dict[str, str]]]:
    """Batched variant of :func:`get_research_source_links`.

    For news-feed list views that would otherwise fire one query per
    research item (N+1). Runs one ``WHERE research_id IN (...)`` query and
    groups the rows in Python. The returned dict maps each research_id to
    its top-N links (same shape as :func:`get_research_source_links`).
    Research ids with zero rows map to ``[]``; an empty ``research_ids``
    returns ``{}`` without querying.

    ``limit=None`` returns every link for each research (no cap) — used by
    the report API, which exposes the full source list rather than a top-N.
    """
    result: Dict[str, List[Dict[str, str]]] = {rid: [] for rid in research_ids}
    if not research_ids:
        return result

    rows = (
        db_session.query(ResearchResource)
        .filter(ResearchResource.research_id.in_(research_ids))
        .filter(ResearchResource.url.isnot(None))
        .order_by(ResearchResource.research_id, ResearchResource.id.asc())
        .all()
    )
    for row in rows:
        bucket = result.setdefault(row.research_id, [])
        if limit is not None and len(bucket) >= limit:
            continue
        link = _feed_link(row.url, row.title)
        if link is not None:
            bucket.append(link)
    return result