Coverage for src/local_deep_research/web/services/report_assembly_service.py: 95%

84 statements  

« prev     ^ index     » next       coverage.py v7.14.1, created at 2026-06-03 23:15 +0000

1"""Reconstruct the legacy "full report" view from structured storage. 

2 

3`research.report_content` stores only the synthesized answer (with inline 

4`[N](url)` hyperlinks). Sources live in the `research_resources` table; 

5metrics live in `research.research_meta`. This module rebuilds the 

6combined `answer + ## Sources + ## Research Metrics` view on demand for 

7display and export — chat reads `research.report_content` directly and 

8never goes through this module. 

9 

10DO NOT write the assembled output back to `research.report_content` — 

11that column must stay answer-only. Writing assembled output back would 

12silently re-introduce the regex over-strip class of bugs that 

13answer-only storage avoids. 

14""" 

15 

16import re 

17from typing import Any, Dict, List, Optional 

18 

19from loguru import logger 

20from sqlalchemy.orm import Session 

21 

22from ...database.models.research import ResearchHistory, ResearchResource 

23from ...utilities.search_utilities import format_links_to_markdown 

24 

# Patterns for the legacy-row guard in `assemble_full_report`. Each heading
# must start a line (``re.MULTILINE`` lets ``^`` anchor at every line start);
# a plain substring ``in body`` test would be too loose and would also fire
# on prose that merely mentions the heading mid-line.
_LEGACY_SOURCES_RE = re.compile(r"^## Sources\b", flags=re.MULTILINE)
_LEGACY_METRICS_RE = re.compile(r"^## Research Metrics\b", flags=re.MULTILINE)

29 

30 

def assemble_full_report(
    research: Optional[ResearchHistory], db_session: Session
) -> Optional[str]:
    """Build the combined "full report" markdown for one research row.

    The stored ``research.report_content`` is used as the leading section;
    a ``## Sources`` block and a ``## Research Metrics`` block are appended
    when they can be rendered and are not already present in the stored
    text.

    Args:
        research: The ResearchHistory ORM row, or ``None``. Should be
            loaded inside ``db_session`` so lazily-loaded attributes such
            as ``research.research_meta`` stay accessible.
        db_session: Active SQLAlchemy session, used to query the
            ``research_resources`` rows for the sources block.

    Returns:
        ``None`` if ``research`` is ``None`` (callers are expected to map
        that to "not found"). Otherwise the sections joined by a blank
        line. A row with no content, no sources and no metrics yields
        ``""``.

    Raises:
        Whatever the sources/metrics builders raise; nothing is caught
        here.
    """
    if research is None:
        return None

    answer = research.report_content or ""
    sections = [answer]

    # Legacy rows already carry these headings inside report_content;
    # appending freshly built sections would render them twice. The
    # patterns are line-anchored so that an answer merely quoting
    # "## Sources" mid-line does not count as a legacy row.
    if _LEGACY_SOURCES_RE.search(answer) is None:
        # Deliberately not wrapped in try/except: callers already turn
        # failures into an HTTP 500, and swallowing an error here would
        # yield a report that looks complete yet has no sources at all.
        sources_block = _build_sources_markdown(research, db_session)
        if sources_block:
            sections.append("## Sources\n\n" + sources_block)

    if _LEGACY_METRICS_RE.search(answer) is None:
        metrics_block = _build_metrics_markdown(research)
        if metrics_block:
            sections.append("## Research Metrics\n" + metrics_block)

    return "\n\n".join(sections)

80 

81 

def _build_metrics_markdown(research: ResearchHistory) -> str:
    """Produce the bullet list shown under the Research Metrics heading.

    Values are read from ``research.research_meta`` (treated as an empty
    mapping when falsy):

    * ``"iterations"`` -> ``- Search Iterations: ...`` (emitted whenever
      the value is not ``None``, so ``0`` is still shown).
    * ``"generated_at"`` -> ``- Generated at: ...``; when that entry is
      missing or falsy, ``research.completed_at`` is used instead.

    Returns:
        The bullet lines joined with newlines, or ``""`` when neither
        value is available.
    """
    stored = research.research_meta or {}
    bullets = []

    iteration_count = stored.get("iterations")
    if iteration_count is not None:
        bullets.append(f"- Search Iterations: {iteration_count}")

    # Rows without a persisted generation time fall back to the row's
    # completion timestamp.
    timestamp = stored.get("generated_at") or research.completed_at
    if timestamp:
        bullets.append(f"- Generated at: {timestamp}")

    return "\n".join(bullets)

104 

105 

def _build_sources_markdown(
    research: ResearchHistory, db_session: Session
) -> str:
    """Produce the Sources block body from ``research_resources`` rows.

    Every ResearchResource row of this research (ordered by ascending id)
    is converted to the dict shape handed to ``format_links_to_markdown``:
    ``url``, ``title``, ``index`` and ``journal_quality``. The citation
    number is taken from ``resource_metadata['original_data']['index']``
    when present; otherwise the row's 1-based position is used (as a
    string) and the fallback is counted for a debug log line.

    Returns:
        Whatever ``format_links_to_markdown`` returns for the collected
        link dicts.
    """
    query = db_session.query(ResearchResource).filter_by(
        research_id=research.id
    )
    rows = query.order_by(ResearchResource.id.asc()).all()

    links: List[Dict[str, Any]] = []
    fallback_used = 0
    for position, row in enumerate(rows, start=1):
        # Guard against metadata that is not a dict (e.g. legacy rows
        # that stored it as a string).
        raw_meta = row.resource_metadata
        if not isinstance(raw_meta, dict):
            raw_meta = {}
        source_data = raw_meta.get("original_data")
        if not isinstance(source_data, dict):
            source_data = {}

        # Explicit None / empty-string test: an index of 0 is a real
        # value and must not trigger the fallback.
        citation = source_data.get("index")
        if citation is None or citation == "":
            fallback_used += 1
            citation = str(position)

        link = {
            "url": str(row.url) if row.url else "",
            "title": str(row.title) if row.title else "Untitled",
            "index": citation,
            "journal_quality": source_data.get("journal_quality"),
        }
        links.append(link)

    if fallback_used:
        # Logged at DEBUG rather than WARNING: this is expected for
        # legacy rows and for URL-less entries skipped at save time, and
        # the row-order fallback keeps the rendering usable. The
        # research_id is bound so the record can be attributed to this
        # research.
        bound_logger = logger.bind(research_id=research.id)
        bound_logger.debug(
            "_build_sources_markdown: {} of {} rows missing original "
            "citation index; using row order. Common cause: URL-less "
            "entries were skipped at save time.",
            fallback_used,
            len(rows),
        )

    return format_links_to_markdown(links)

165 

166 

def get_research_source_links(
    research_id: str, db_session: Session, limit: int = 3
) -> List[Dict[str, str]]:
    """Top-N source links for a research, in row-insertion order.

    Returns dicts shaped ``{"url": str, "title": str}`` matching the
    news feed's ``links`` contract (``news/api.py`` consumers). A missing
    title falls back to the URL's domain; titles longer than 50
    characters are cut to 50 and suffixed with ``"..."``.

    The ``limit`` is applied *after* unusable rows are discarded. Rows
    whose stripped URL does not start with ``http`` (including empty
    strings, which pass the SQL ``IS NOT NULL`` filter) are skipped and
    do not count toward the cap. Previously the cap was a SQL ``LIMIT``
    applied before that filter, so such rows could crowd out valid links
    and the function returned fewer than ``limit`` entries even though
    more were available -- unlike :func:`get_research_source_links_batch`,
    which already caps after filtering.

    Args:
        research_id: The ResearchHistory id.
        db_session: Active SQLAlchemy session bound to the user DB.
        limit: Maximum number of links to return. ``None`` means no cap.

    Returns:
        Up to ``limit`` link dicts ordered by ascending resource id.
    """
    # No SQL LIMIT here on purpose: the http-prefix check happens in
    # Python (after stripping whitespace), so the cap has to be enforced
    # after that check to be accurate.
    rows = (
        db_session.query(ResearchResource)
        .filter_by(research_id=research_id)
        .filter(ResearchResource.url.isnot(None))
        .order_by(ResearchResource.id.asc())
        .all()
    )
    out: List[Dict[str, str]] = []
    for r in rows:
        if limit is not None and len(out) >= limit:
            break
        url = (r.url or "").strip()
        if not url.startswith("http"):
            continue
        title = (r.title or "").strip()
        if not title:
            # Domain fallback: text between the scheme's "//" and the
            # next "/", with "www." removed.
            domain = url.split("//")[-1].split("/")[0]
            title = domain.replace("www.", "")
        if len(title) > 50:
            title = title[:50] + "..."
        out.append({"url": url, "title": title})
    return out

203 

204 

def get_research_source_links_batch(
    research_ids: List[str], db_session: Session, limit: Optional[int] = 3
) -> Dict[str, List[Dict[str, str]]]:
    """Fetch source links for many researches with a single query.

    Intended for list views that would otherwise issue one query per
    research item. Runs one ``research_id IN (...)`` query and groups the
    rows in Python. Each link has the same ``{"url", "title"}`` shape as
    :func:`get_research_source_links` produces.

    Args:
        research_ids: Ids to look up. Every id is present in the result;
            ids without usable rows map to ``[]``.
        db_session: Active SQLAlchemy session bound to the user DB.
        limit: Per-research cap on returned links, counted after
            unusable URLs are skipped. ``None`` disables the cap and
            returns every link.

    Returns:
        Mapping of research id to its list of link dicts, ordered by
        ascending resource id within each research.
    """
    grouped: Dict[str, List[Dict[str, str]]] = {}
    for rid in research_ids:
        grouped[rid] = []
    if not research_ids:
        return grouped

    query = (
        db_session.query(ResearchResource)
        .filter(ResearchResource.research_id.in_(research_ids))
        .filter(ResearchResource.url.isnot(None))
        .order_by(ResearchResource.research_id, ResearchResource.id.asc())
    )
    for row in query.all():
        links = grouped.setdefault(row.research_id, [])
        # Once a research has reached its cap, ignore its remaining rows.
        if limit is not None and len(links) >= limit:
            continue

        link_url = (row.url or "").strip()
        if link_url.startswith("http"):
            label = (row.title or "").strip()
            if not label:
                # Domain fallback: text between "//" and the next "/",
                # with "www." removed.
                host = link_url.split("//")[-1].split("/")[0]
                label = host.replace("www.", "")
            if len(label) > 50:
                label = label[:50] + "..."
            links.append({"url": link_url, "title": label})
    return grouped

244 return result