Coverage for src/local_deep_research/web/services/report_assembly_service.py

"""Reconstruct the legacy "full report" view from structured storage.

`research.report_content` stores only the synthesized answer (with inline
`[N](url)` hyperlinks). Sources live in the `research_resources` table;
metrics live in `research.research_meta`. This module rebuilds the
combined `answer + ## Sources + ## Research Metrics` view on demand for
display and export — chat reads `research.report_content` directly and
never goes through this module.

DO NOT write the assembled output back to `research.report_content` —
that column must stay answer-only. Writing assembled output back would
silently re-introduce the regex over-strip class of bugs that
answer-only storage avoids.
"""

16import re

17from typing import Any, Dict, List, Optional

19from loguru import logger

20from sqlalchemy.orm import Session

22from ...database.models.research import ResearchHistory, ResearchResource

23from ...utilities.search_utilities import format_links_to_markdown

# Patterns used by `assemble_full_report` to recognise legacy rows whose
# stored body already embeds the section headings. They only match when the
# heading begins a line (MULTILINE `^`); a plain substring test would also
# fire on answers that merely mention the heading text mid-line.
_LEGACY_SOURCES_RE = re.compile(
    r"^## Sources\b",
    flags=re.MULTILINE,
)
_LEGACY_METRICS_RE = re.compile(
    r"^## Research Metrics\b",
    flags=re.MULTILINE,
)

def assemble_full_report(
    research: Optional[ResearchHistory], db_session: Session
) -> Optional[str]:
    """Build the combined answer / sources / metrics markdown for a research.

    Args:
        research: The ResearchHistory ORM row. It must have been loaded
            inside ``db_session`` so that lazily reading
            ``research.research_meta`` does not raise
            DetachedInstanceError.
        db_session: Active SQLAlchemy session bound to the user DB; used
            to query ``research_resources`` for the sources block.

    Returns:
        ``None`` if ``research`` is ``None`` (callers should map that to a
        404). Otherwise the stored answer followed by an optional
        ``## Sources`` block and an optional ``## Research Metrics`` block,
        separated by blank lines. A row with no body, no sources and no
        metrics yields ``""`` (found, but empty).
    """
    if research is None:
        return None

    answer = research.report_content or ""
    sections = [answer]

    # Older rows already carry `## Sources` / `## Research Metrics` inside
    # report_content; appending freshly built sections would render them
    # twice. The patterns are anchored to line start so prose that merely
    # quotes the heading inline does not count as a legacy block.
    if _LEGACY_SOURCES_RE.search(answer) is None:
        # No try/except on purpose: callers already translate exceptions
        # into HTTP 500, and swallowing a failure here would produce a
        # report that looks complete while silently lacking its sources.
        sources_block = _build_sources_markdown(research, db_session)
        if sources_block:
            sections.append("## Sources\n\n" + sources_block)

    if _LEGACY_METRICS_RE.search(answer) is None:
        metrics_block = _build_metrics_markdown(research)
        if metrics_block:
            sections.append("## Research Metrics\n" + metrics_block)

    return "\n\n".join(sections)

def _build_metrics_markdown(research: ResearchHistory) -> str:
    """Render the Research Metrics block from persisted metadata.

    The inline metrics block that used to be written at save time showed
    the iteration count and a save-time timestamp. Both are persisted in
    ``research.research_meta`` (keys ``"iterations"`` and
    ``"generated_at"``), so reading them back recovers the same values.

    Args:
        research: Row whose ``research_meta`` and ``completed_at`` are read.

    Returns:
        Markdown bullet lines joined by ``"\\n"``: a ``Search Iterations``
        line when ``iterations`` is present (``0`` counts as present) and a
        ``Generated at`` line when a timestamp is available. The timestamp
        falls back to ``research.completed_at`` when ``generated_at`` is
        missing or empty (legacy rows or scheduler-saved research).
        Returns ``""`` when neither value can be rendered.
    """
    from collections.abc import Mapping

    # Defensive, mirroring the guard `_build_sources_markdown` applies to
    # `resource_metadata`: a legacy row may hold something that is not a
    # mapping (e.g. a raw string). Treat that as "no metadata" instead of
    # raising AttributeError on `.get`, so the completed_at fallback still
    # renders.
    meta = research.research_meta
    if not isinstance(meta, Mapping):
        meta = {}

    iterations = meta.get("iterations")
    generated_at = meta.get("generated_at") or research.completed_at

    lines = []
    # `is not None` (not truthiness) so an iteration count of 0 is shown.
    if iterations is not None:
        lines.append(f"- Search Iterations: {iterations}")
    if generated_at:
        lines.append(f"- Generated at: {generated_at}")
    return "\n".join(lines)

104

105

def _build_sources_markdown(
    research: ResearchHistory, db_session: Session
) -> str:
    """Render the Sources block from the ``research_resources`` table.

    Every ResearchResource row of this research (ordered by ascending row
    id) is converted into the dict shape passed to
    ``format_links_to_markdown``. The citation number is taken from
    ``resource_metadata['original_data']['index']`` when available: that is
    the index assigned at search time and the one the inline ``[N]``
    references in the saved answer point to. Rows without it fall back to
    their 1-based position in the result set.
    """
    base_query = db_session.query(ResearchResource).filter_by(
        research_id=research.id
    )
    rows = base_query.order_by(ResearchResource.id.asc()).all()

    links: List[Dict[str, Any]] = []
    fallback_used = 0
    for position, row in enumerate(rows, start=1):
        # Legacy rows may have stored the metadata as a string; anything
        # that is not a dict is treated as empty.
        metadata = row.resource_metadata
        if not isinstance(metadata, dict):
            metadata = {}

        original_data = metadata.get("original_data")
        if not isinstance(original_data, dict):
            original_data = {}

        # Compare against None / "" explicitly so that an index of 0 is
        # kept rather than mistaken for "missing".
        citation_index = original_data.get("index")
        if citation_index is None or citation_index == "":
            fallback_used += 1
            citation_index = str(position)

        link = {
            "url": str(row.url) if row.url else "",
            "title": str(row.title) if row.title else "Untitled",
            "index": citation_index,
            "journal_quality": original_data.get("journal_quality"),
        }
        links.append(link)

    if fallback_used > 0:
        # DEBUG rather than WARNING: this is expected for legacy rows and
        # for URL-less entries skipped at save time, and the row-order
        # fallback keeps the rendering correct. research_id is bound so
        # the message is routed to the per-research log table.
        research_logger = logger.bind(research_id=research.id)
        research_logger.debug(
            "_build_sources_markdown: {} of {} rows missing original "
            "citation index; using row order. Common cause: URL-less "
            "entries were skipped at save time.",
            fallback_used,
            len(rows),
        )

    return format_links_to_markdown(links)

165

166

def get_research_source_links(
    research_id: str, db_session: Session, limit: int = 3
) -> List[Dict[str, str]]:
    """Top-N source links for a research, in row-insertion order.

    Returns dicts shaped ``{"url": str, "title": str}`` matching the
    news feed's ``links`` contract (``news/api.py`` consumers). A missing
    title falls back to the URL's host with a leading ``www.`` removed.
    Titles longer than 50 characters are cut to 50 characters plus
    ``"..."`` to match the existing list-card rendering in the news UI.

    Rows whose stripped URL does not start with ``http`` are skipped and
    do not count towards ``limit``. This matches
    :func:`get_research_source_links_batch`.

    Args:
        research_id: The ResearchHistory id.
        db_session: Active SQLAlchemy session bound to the user DB.
        limit: Maximum number of links to return. ``None`` disables the
            cap; ``0`` (or a negative value) yields an empty list.

    Returns:
        At most ``limit`` link dicts, ordered by ascending resource row id.
    """
    # The cap is applied in Python, after the URL filter below. A SQL
    # LIMIT would count rows that are later rejected (non-http URLs) and
    # could return fewer than `limit` links although more valid rows exist.
    rows = (
        db_session.query(ResearchResource)
        .filter_by(research_id=research_id)
        .filter(ResearchResource.url.isnot(None))
        .order_by(ResearchResource.id.asc())
        .all()
    )
    out: List[Dict[str, str]] = []
    for r in rows:
        if limit is not None and len(out) >= limit:
            break
        url = (r.url or "").strip()
        if not url.startswith("http"):
            continue
        title = (r.title or "").strip()
        if not title:
            # Host part of the URL: text after the scheme's "//" up to the
            # first "/".
            domain = url.split("//")[-1].split("/")[0]
            # Strip only a leading "www."; replacing the substring anywhere
            # would mangle hosts such as "awww.example.com".
            title = domain[4:] if domain.startswith("www.") else domain
        if len(title) > 50:
            title = title[:50] + "..."
        out.append({"url": url, "title": title})
    return out

203

204

def get_research_source_links_batch(
    research_ids: List[str], db_session: Session, limit: Optional[int] = 3
) -> Dict[str, List[Dict[str, str]]]:
    """Batched variant of :func:`get_research_source_links`.

    For news-feed list views that would otherwise fire one query per
    research item (N+1). Issues one ``WHERE research_id IN (...)`` query
    and groups the rows in Python.

    Args:
        research_ids: Research ids to fetch links for.
        db_session: Active SQLAlchemy session bound to the user DB.
        limit: Maximum number of links per research. ``None`` returns
            every link for each research (no cap); the report API uses
            that to expose the full source list rather than a top-N.

    Returns:
        Dict mapping each requested research id to its link dicts
        (``{"url": str, "title": str}``, ordered by ascending resource row
        id). Research ids with zero usable rows map to ``[]``. Rows whose
        stripped URL does not start with ``http`` are skipped and do not
        count towards ``limit``.
    """
    result: Dict[str, List[Dict[str, str]]] = {rid: [] for rid in research_ids}
    if not research_ids:
        return result

    rows = (
        db_session.query(ResearchResource)
        .filter(ResearchResource.research_id.in_(research_ids))
        .filter(ResearchResource.url.isnot(None))
        .order_by(ResearchResource.research_id, ResearchResource.id.asc())
        .all()
    )
    for r in rows:
        bucket = result.setdefault(r.research_id, [])
        # `continue`, not `break`: later rows may belong to other
        # research ids whose buckets are not yet full.
        if limit is not None and len(bucket) >= limit:
            continue
        url = (r.url or "").strip()
        if not url.startswith("http"):
            continue
        title = (r.title or "").strip()
        if not title:
            # Host part of the URL: text after the scheme's "//" up to the
            # first "/".
            domain = url.split("//")[-1].split("/")[0]
            # Strip only a leading "www."; replacing the substring anywhere
            # would mangle hosts such as "awww.example.com".
            title = domain[4:] if domain.startswith("www.") else domain
        if len(title) > 50:
            title = title[:50] + "..."
        bucket.append({"url": url, "title": title})
    return result

Coverage for src/local_deep_research/web/services/report_assembly_service.py: 97%

84 statements