Coverage for src/local_deep_research/text_optimization/citation_formatter.py: 96%

486 statements  

« prev     ^ index     » next       coverage.py v7.14.1, created at 2026-06-03 23:15 +0000

1"""Citation formatter for adding hyperlinks and alternative citation styles.""" 

2 

3import re 

4from enum import Enum 

5from typing import Any, Dict, List, Tuple 

6from urllib.parse import urlparse 

7 

_SOURCES_SECTION_PATTERNS = [
    # Markdown heading form: one to three '#' followed by a section keyword.
    re.compile(
        r"^#{1,3}\s*(?:Sources|References|Bibliography|Citations)",
        re.MULTILINE | re.IGNORECASE,
    ),
    # Bare keyword alone on its line, optionally followed by a colon.
    re.compile(
        r"^(?:Sources|References|Bibliography|Citations):?\s*$",
        re.MULTILINE | re.IGNORECASE,
    ),
]


def find_sources_section(content: str) -> int:
    """Locate the sources/references section inside *content*.

    The patterns are tried in list order and the first pattern that matches
    anywhere wins, so a heading-style match takes priority over a bare
    keyword line even when the bare line appears earlier in the text.

    Returns:
        The character offset where the winning match starts, or -1 when no
        pattern matches.
    """
    hits = (regex.search(content) for regex in _SOURCES_SECTION_PATTERNS)
    first_hit = next((hit for hit in hits if hit is not None), None)
    return -1 if first_hit is None else first_hit.start()

30 

31 

class CitationMode(Enum):
    """Citation rendering styles supported by the formatter."""

    # Numeric label wrapped in a hyperlink: [1]
    NUMBER_HYPERLINKS = "number_hyperlinks"

    # Domain used as the hyperlinked label: [arxiv.org]
    DOMAIN_HYPERLINKS = "domain_hyperlinks"

    # Domain label with smart IDs: [arxiv.org] or [arxiv.org-1]
    DOMAIN_ID_HYPERLINKS = "domain_id_hyperlinks"

    # Domain label that always carries an ID: [arxiv.org-1]
    DOMAIN_ID_ALWAYS_HYPERLINKS = "domain_id_always_hyperlinks"

    # Keeps the global citation number and prefixes it with a short source
    # tag derived from the URL: known academic sources via ``URLClassifier``
    # (``arxiv-7``, ``pubmed-3``), the domain otherwise (``nytimes.com-9``),
    # and ``local-N`` for empty / local URLs. Unlike the DOMAIN_ID_* modes
    # the suffix is the original citation number, so labels never collide
    # and match the bibliography order: ``[1]`` arxiv + ``[2]`` openai +
    # ``[3]`` arxiv -> ``arxiv-1``, ``openai-2``, ``arxiv-3``.
    SOURCE_TAGGED_HYPERLINKS = "source_tagged_hyperlinks"

    # Numeric label left without any hyperlink: [1]
    NO_HYPERLINKS = "no_hyperlinks"

52 

53 

54class CitationFormatter: 

55 """Formats citations in markdown documents with various styles.""" 

56 

57 def __init__(self, mode: CitationMode = CitationMode.NUMBER_HYPERLINKS): 

58 self.mode = mode 

59 # Use negative lookbehind and lookahead to avoid matching already formatted citations 

60 # Also match Unicode lenticular brackets 【】 (U+3010 and U+3011) that LLMs sometimes generate 

61 self.citation_pattern = re.compile( 

62 r"(?<![\[【])[\[【](\d+)[\]】](?![\]】])" 

63 ) 

64 self.comma_citation_pattern = re.compile( 

65 r"[\[【](\d+(?:,\s*\d+)+)[\]】]" 

66 ) 

67 # Also match "Source X" or "source X" patterns 

68 self.source_word_pattern = re.compile(r"\b[Ss]ource\s+(\d+)\b") 

69 self.sources_pattern = re.compile( 

70 r"^\[(\d+(?:,\s*\d+)*)\]\s*(.+?)(?:\n\s*URL:\s*(.+?))?$", 

71 re.MULTILINE, 

72 ) 

73 

74 def _create_source_word_replacer(self, formatter_func): 

75 """Create a replacement function for 'Source X' patterns. 

76 

77 Args: 

78 formatter_func: A function that takes citation_num and returns formatted text 

79 

80 Returns: 

81 A replacement function for use with regex sub 

82 """ 

83 

84 def replace_source_word(match): 

85 citation_num = match.group(1) 

86 return formatter_func(citation_num) 

87 

88 return replace_source_word 

89 

90 def _create_citation_formatter(self, sources_dict, format_pattern): 

91 """Create a formatter function for citations. 

92 

93 Args: 

94 sources_dict: Dictionary mapping citation numbers to data 

95 format_pattern: A callable that takes (citation_num, data) and returns formatted string 

96 

97 Returns: 

98 A function that formats citations or returns fallback 

99 """ 

100 

101 def formatter(citation_num): 

102 if citation_num in sources_dict: 

103 data = sources_dict[citation_num] 

104 return format_pattern(citation_num, data) 

105 return f"[{citation_num}]" 

106 

107 return formatter 

108 

109 def _replace_comma_citations(self, content, lookup, format_one): 

110 """Replace comma-separated citations like [1, 2, 3] using *lookup* and *format_one*. 

111 

112 Args: 

113 content: Text to process 

114 lookup: Dict mapping citation number (str) to data 

115 format_one: ``(num, data) -> str`` callback that formats a single citation 

116 """ 

117 

118 def _replacer(match): 

119 nums = [n.strip() for n in match.group(1).split(",")] 

120 parts = [] 

121 for num in nums: 

122 if num in lookup: 

123 parts.append(format_one(num, lookup[num])) 

124 else: 

125 parts.append(f"[{num}]") 

126 return "".join(parts) 

127 

128 return self.comma_citation_pattern.sub(_replacer, content) 

129 

130 def format_document(self, content: str) -> str: 

131 """Format citations and return the concatenated answer + sources blob. 

132 

133 Kept for backward compatibility — most call sites only need the 

134 concatenated string. New code that needs to persist answer-only 

135 should use :meth:`format_document_split` instead so the boundary 

136 is returned explicitly (no re-parsing of the concatenated output). 

137 """ 

138 formatted_answer, sources_md = self.format_document_split(content) 

139 return formatted_answer + sources_md 

140 

141 def format_document_split(self, content: str) -> Tuple[str, str]: 

142 """Format citations and return (answer, sources_md) separately. 

143 

144 The boundary between the LLM's answer and the trailing Sources 

145 section is computed inside this method. Callers that only want 

146 the answer (e.g. the chat-mode save site) get a clean split 

147 without re-applying a regex on concatenated output downstream. 

148 

149 Returns ``(content, "")`` when the formatter is in NO_HYPERLINKS 

150 mode or when no Sources section can be found in ``content``. 

151 """ 

152 if self.mode == CitationMode.NO_HYPERLINKS: 

153 return content, "" 

154 

155 sources_start = self._find_sources_section(content) 

156 if sources_start == -1: 

157 return content, "" 

158 

159 document_content = content[:sources_start] 

160 sources_content = content[sources_start:] 

161 

162 sources = self._parse_sources(sources_content) 

163 

164 if self.mode == CitationMode.NUMBER_HYPERLINKS: 

165 formatted_content = self._format_number_hyperlinks( 

166 document_content, sources 

167 ) 

168 elif self.mode == CitationMode.DOMAIN_HYPERLINKS: 

169 formatted_content = self._format_domain_hyperlinks( 

170 document_content, sources 

171 ) 

172 elif self.mode == CitationMode.DOMAIN_ID_HYPERLINKS: 

173 formatted_content = self._format_domain_id_hyperlinks( 

174 document_content, sources 

175 ) 

176 elif self.mode == CitationMode.DOMAIN_ID_ALWAYS_HYPERLINKS: 

177 formatted_content = self._format_domain_id_always_hyperlinks( 

178 document_content, sources 

179 ) 

180 elif self.mode == CitationMode.SOURCE_TAGGED_HYPERLINKS: 180 ↛ 187line 180 didn't jump to line 187 because the condition on line 180 was always true

181 formatted_content = self._format_source_tagged_hyperlinks( 

182 document_content, 

183 sources, 

184 self._parse_collections(sources_content), 

185 ) 

186 else: 

187 formatted_content = document_content 

188 

189 return formatted_content, sources_content 

190 

191 def apply_inline_hyperlinks( 

192 self, content: str, sources: List[Dict[str, Any]] 

193 ) -> str: 

194 """Hyperlink ``[N]`` refs using a structured source list. 

195 

196 Dispatches on ``self.mode`` so the user's chosen citation 

197 format (Settings → Report → Citation Format) is honored on 

198 the fallback path the same way it is in 

199 :meth:`format_document_split`. Inherits all the existing 

200 per-mode guards (lookbehind/lookahead against ``[[1]]``, 

201 comma-list handling like ``[1,2,3]``, ``Source N`` word form, 

202 missing-index pass-through, lenticular bracket support). 

203 

204 Used as the safe fallback at save time when the LLM does NOT 

205 emit a Sources section in its prose — the structured source 

206 list (e.g. ``search_system.all_links_of_system``) is the 

207 canonical source of URLs and indices. 

208 """ 

209 if not content or not sources: 

210 return content or "" 

211 if self.mode == CitationMode.NO_HYPERLINKS: 

212 return content 

213 

214 # Search-engine result dicts use either "url" or "link" for the 

215 # destination — Searxng emits {"link": ..., "title": ..., "snippet": ...} 

216 # (search_engine_searxng.py:538) and other engines use "url". 

217 # Looking up only `s["url"]` silently dropped every Searxng-sourced 

218 # citation, leaving the answer body with plain `[N]` brackets even 

219 # though the Sources section beneath was fully populated. Accept 

220 # both keys so the hyperlink fallback works regardless of engine. 

221 def _src_url(s): 

222 return s.get("url") or s.get("link") or "" 

223 

224 adapted: Dict[str, Tuple[str, str]] = { 

225 str(s["index"]): (s.get("title", "Untitled"), _src_url(s)) 

226 for s in sources 

227 if _src_url(s) and s.get("index") is not None 

228 } 

229 if not adapted: 229 ↛ 230line 229 didn't jump to line 230 because the condition on line 229 was never true

230 return content 

231 

232 # Per-mode dispatch — mirrors format_document_split so the user's 

233 # chosen citation format applies on this fallback path too. 

234 # Previously this was hard-coded to _format_number_hyperlinks, 

235 # which meant chat-mode answers (which always hit this fallback 

236 # because the langgraph-agent synthesis doesn't emit a ## Sources 

237 # block in its prose) ignored the report.citation_format setting 

238 # entirely — every chat answer came out as [[N]](url) even when 

239 # the user picked domain-based or source-tagged formatting. 

240 if self.mode == CitationMode.DOMAIN_HYPERLINKS: 

241 return self._format_domain_hyperlinks(content, adapted) 

242 if self.mode == CitationMode.DOMAIN_ID_HYPERLINKS: 

243 return self._format_domain_id_hyperlinks(content, adapted) 

244 if self.mode == CitationMode.DOMAIN_ID_ALWAYS_HYPERLINKS: 244 ↛ 245line 244 didn't jump to line 245 because the condition on line 244 was never true

245 return self._format_domain_id_always_hyperlinks(content, adapted) 

246 if self.mode == CitationMode.SOURCE_TAGGED_HYPERLINKS: 

247 # Pull collection names off the structured source dicts 

248 # (format_links_to_markdown uses the same shape: 

249 # link["metadata"]["collection_name"]) so the SOURCE_TAGGED 

250 # formatter can surface library/RAG tags as the citation 

251 # label when present. 

252 collections: Dict[str, str] = {} 

253 for s in sources: 

254 idx = s.get("index") 

255 if idx is None: 255 ↛ 256line 255 didn't jump to line 256 because the condition on line 255 was never true

256 continue 

257 meta = s.get("metadata") or {} 

258 coll = meta.get("collection_name") 

259 if coll: 

260 collections.setdefault(str(idx), str(coll)) 

261 return self._format_source_tagged_hyperlinks( 

262 content, adapted, collections 

263 ) 

264 # NUMBER_HYPERLINKS is the default and the catch-all for any 

265 # mode added later that doesn't have an explicit branch above. 

266 return self._format_number_hyperlinks(content, adapted) 

267 

268 def _find_sources_section(self, content: str) -> int: 

269 """Find the start of the sources/references section.""" 

270 return find_sources_section(content) 

271 

272 def _parse_sources( 

273 self, sources_content: str 

274 ) -> Dict[str, Tuple[str, str]]: 

275 """ 

276 Parse sources section to extract citation numbers, titles, and URLs. 

277 

278 Returns: 

279 Dictionary mapping citation number to (title, url) tuple 

280 """ 

281 sources = {} 

282 matches = list(self.sources_pattern.finditer(sources_content)) 

283 

284 for match in matches: 

285 citation_nums_str = match.group(1) 

286 title = match.group(2).strip() 

287 url = match.group(3).strip() if match.group(3) else "" 

288 

289 # Handle comma-separated citation numbers like [36, 3] 

290 # Split by comma and strip whitespace 

291 individual_nums = [ 

292 num.strip() for num in citation_nums_str.split(",") 

293 ] 

294 

295 # Add an entry for each individual number 

296 for num in individual_nums: 

297 sources[num] = (title, url) 

298 

299 return sources 

300 

301 def _format_number_hyperlinks( 

302 self, content: str, sources: Dict[str, Tuple[str, str]] 

303 ) -> str: 

304 """Replace [1] with hyperlinked version where only the number is linked.""" 

305 # Filter sources that have URLs 

306 url_sources = { 

307 num: (title, url) for num, (title, url) in sources.items() if url 

308 } 

309 

310 # Create formatter for citations with number hyperlinks 

311 def format_number_link(citation_num, data): 

312 _, url = data 

313 return f"[[{citation_num}]]({url})" 

314 

315 # Handle comma-separated citations like [1, 2, 3] 

316 content = self._replace_comma_citations( 

317 content, url_sources, format_number_link 

318 ) 

319 

320 formatter = self._create_citation_formatter( 

321 url_sources, format_number_link 

322 ) 

323 

324 # Handle individual citations 

325 def replace_citation(match): 

326 return ( 

327 formatter(match.group(1)) 

328 if match.group(1) in url_sources 

329 else match.group(0) 

330 ) 

331 

332 content = self.citation_pattern.sub(replace_citation, content) 

333 

334 # Also handle "Source X" patterns 

335 return self.source_word_pattern.sub( 

336 self._create_source_word_replacer(formatter), content 

337 ) 

338 

339 def _format_domain_hyperlinks( 

340 self, content: str, sources: Dict[str, Tuple[str, str]] 

341 ) -> str: 

342 """Replace [1] with [domain.com] hyperlinked version.""" 

343 

344 # Filter sources that have URLs 

345 url_sources = { 

346 num: (title, url) for num, (title, url) in sources.items() if url 

347 } 

348 

349 # Create formatter for citations with domain hyperlinks 

350 def format_domain_link(citation_num, data): 

351 _, url = data 

352 domain = self._extract_domain(url) 

353 return f"[[{domain}]]({url})" 

354 

355 # Handle comma-separated citations like [1, 2, 3] 

356 content = self._replace_comma_citations( 

357 content, url_sources, format_domain_link 

358 ) 

359 

360 formatter = self._create_citation_formatter( 

361 url_sources, format_domain_link 

362 ) 

363 

364 # Handle individual citations 

365 def replace_citation(match): 

366 return ( 

367 formatter(match.group(1)) 

368 if match.group(1) in url_sources 

369 else match.group(0) 

370 ) 

371 

372 content = self.citation_pattern.sub(replace_citation, content) 

373 

374 # Also handle "Source X" patterns 

375 return self.source_word_pattern.sub( 

376 self._create_source_word_replacer(formatter), content 

377 ) 

378 

379 def _format_domain_id_hyperlinks( 

380 self, content: str, sources: Dict[str, Tuple[str, str]] 

381 ) -> str: 

382 """Replace [1] with [domain.com-1] hyperlinked version with hyphen-separated IDs.""" 

383 # First, create a mapping of domains to their citation numbers 

384 domain_citations: dict[str, list[Any]] = {} 

385 

386 for citation_num, (title, url) in sources.items(): 

387 if url: 387 ↛ 386line 387 didn't jump to line 386 because the condition on line 387 was always true

388 domain = self._extract_domain(url) 

389 if domain not in domain_citations: 

390 domain_citations[domain] = [] 

391 domain_citations[domain].append((citation_num, url)) 

392 

393 # Create a mapping from citation number to domain with ID 

394 citation_to_domain_id = {} 

395 for domain, citations in domain_citations.items(): 

396 if len(citations) > 1: 

397 # Multiple citations from same domain - add hyphen and number 

398 for idx, (citation_num, url) in enumerate(citations, 1): 

399 citation_to_domain_id[citation_num] = ( 

400 f"{domain}-{idx}", 

401 url, 

402 ) 

403 else: 

404 # Single citation from domain - no ID needed 

405 citation_num, url = citations[0] 

406 citation_to_domain_id[citation_num] = (domain, url) 

407 

408 # Create formatter for citations with domain_id hyperlinks 

409 def format_domain_id_link(citation_num, data): 

410 domain_id, url = data 

411 return f"[[{domain_id}]]({url})" 

412 

413 # Handle comma-separated citations 

414 content = self._replace_comma_citations( 

415 content, citation_to_domain_id, format_domain_id_link 

416 ) 

417 

418 formatter = self._create_citation_formatter( 

419 citation_to_domain_id, format_domain_id_link 

420 ) 

421 

422 # Handle individual citations 

423 def replace_citation(match): 

424 return ( 

425 formatter(match.group(1)) 

426 if match.group(1) in citation_to_domain_id 

427 else match.group(0) 

428 ) 

429 

430 content = self.citation_pattern.sub(replace_citation, content) 

431 

432 # Also handle "Source X" patterns 

433 return self.source_word_pattern.sub( 

434 self._create_source_word_replacer(formatter), content 

435 ) 

436 

437 def _format_domain_id_always_hyperlinks( 

438 self, content: str, sources: Dict[str, Tuple[str, str]] 

439 ) -> str: 

440 """Replace [1] with [domain.com-1] hyperlinked version, always with IDs.""" 

441 # First, create a mapping of domains to their citation numbers 

442 domain_citations: dict[str, list[Any]] = {} 

443 

444 for citation_num, (title, url) in sources.items(): 

445 if url: 445 ↛ 444line 445 didn't jump to line 444 because the condition on line 445 was always true

446 domain = self._extract_domain(url) 

447 if domain not in domain_citations: 

448 domain_citations[domain] = [] 

449 domain_citations[domain].append((citation_num, url)) 

450 

451 # Create a mapping from citation number to domain with ID 

452 citation_to_domain_id = {} 

453 for domain, citations in domain_citations.items(): 

454 # Always add hyphen and number for consistency 

455 for idx, (citation_num, url) in enumerate(citations, 1): 

456 citation_to_domain_id[citation_num] = (f"{domain}-{idx}", url) 

457 

458 # Create formatter for citations with domain_id hyperlinks 

459 def format_domain_id_link(citation_num, data): 

460 domain_id, url = data 

461 return f"[[{domain_id}]]({url})" 

462 

463 # Handle comma-separated citations 

464 content = self._replace_comma_citations( 

465 content, citation_to_domain_id, format_domain_id_link 

466 ) 

467 

468 formatter = self._create_citation_formatter( 

469 citation_to_domain_id, format_domain_id_link 

470 ) 

471 

472 # Handle individual citations 

473 def replace_citation(match): 

474 return ( 

475 formatter(match.group(1)) 

476 if match.group(1) in citation_to_domain_id 

477 else match.group(0) 

478 ) 

479 

480 content = self.citation_pattern.sub(replace_citation, content) 

481 

482 # Also handle "Source X" patterns 

483 return self.source_word_pattern.sub( 

484 self._create_source_word_replacer(formatter), content 

485 ) 

486 

487 # Sources section may carry a "Collection: <name>" line for RAG / 

488 # library hits (emitted by ``utilities/search_utilities.format_links_to_markdown``). 

489 # The line sits between this ``[N]`` entry's ``URL:`` line and the 

490 # next ``[N+1]`` entry. We anchor the match on a non-greedy span up 

491 # to the next citation header (or end of string) to scope correctly. 

492 _collection_line_pattern = re.compile( 

493 r"^\[(\d+(?:,\s*\d+)*)\][^\n]*\n" # the [N] header line 

494 r"(?:[^\n\[]*\n)*?" # any non-[ lines (typically URL: ...) 

495 r"\s*Collection:\s*(.+?)\s*$", 

496 re.MULTILINE, 

497 ) 

498 

499 def _parse_collections(self, sources_content: str) -> Dict[str, str]: 

500 """Extract ``{citation_num: collection_name}`` from a sources 

501 block. Returns an empty dict when no ``Collection:`` lines exist 

502 — the absence of collection info is the common case (web URLs) 

503 and must never raise.""" 

504 collections: Dict[str, str] = {} 

505 for match in self._collection_line_pattern.finditer(sources_content): 

506 citation_nums_str = match.group(1) 

507 collection = match.group(2).strip() 

508 if not collection: 508 ↛ 509line 508 didn't jump to line 509 because the condition on line 508 was never true

509 continue 

510 for num in (n.strip() for n in citation_nums_str.split(",")): 

511 collections[num] = collection 

512 return collections 

513 

514 def _format_source_tagged_hyperlinks( 

515 self, 

516 content: str, 

517 sources: Dict[str, Tuple[str, str]], 

518 collections: Dict[str, str], 

519 ) -> str: 

520 """Replace ``[N]`` with ``[[source-N]](url)``. 

521 

522 ``source`` resolves to (in order): the RAG ``Collection:`` 

523 tag for library hits, the short URLClassifier tag for known 

524 academic sources (``arxiv``, ``pubmed``, ...), the cleaned 

525 domain otherwise, or ``local`` for empty/file URLs. ``N`` is 

526 the original global citation number — labels never collide and 

527 the suffix always matches the bibliography ordering. 

528 

529 Args: 

530 content: Document body (sources section already split off). 

531 sources: ``{citation_num: (title, url)}`` parsed from the 

532 sources block. 

533 collections: ``{citation_num: collection_name}`` parsed from 

534 optional ``Collection:`` lines in the sources block 

535 (empty dict when no library/RAG hits are cited). Wins 

536 over URL-derived tags when present for a given citation. 

537 """ 

538 

539 def format_link(citation_num, data): 

540 _, url = data 

541 label = self._extract_source_label( 

542 url, collection=collections.get(citation_num) 

543 ) 

544 tag = f"{label}-{citation_num}" 

545 # Only emit a hyperlink for http(s) URLs — local/file URLs are 

546 # rendered as plain bracketed tags so the markdown stays clean 

547 # and viewers don't try to navigate to a server-local path. 

548 return ( 

549 f"[[{tag}]]({url})" 

550 if self._is_linkable_url(url) 

551 else f"[{tag}]" 

552 ) 

553 

554 # Handle comma-separated citations like [1, 2, 3] 

555 content = self._replace_comma_citations(content, sources, format_link) 

556 

557 formatter = self._create_citation_formatter(sources, format_link) 

558 

559 # Handle individual citations 

560 def replace_citation(match): 

561 return ( 

562 formatter(match.group(1)) 

563 if match.group(1) in sources 

564 else match.group(0) 

565 ) 

566 

567 content = self.citation_pattern.sub(replace_citation, content) 

568 

569 # Also handle "Source X" patterns 

570 return self.source_word_pattern.sub( 

571 self._create_source_word_replacer(formatter), content 

572 ) 

573 

574 @staticmethod 

575 def _slugify_collection(name: str) -> str: 

576 """Make a user-set collection name safe for inline citations. 

577 

578 Collection names are free-form strings (``"My Papers"``, 

579 ``"team/finance"``). Citations need a compact token that won't 

580 break markdown — strip whitespace, lowercase, replace runs of 

581 non-alphanumeric chars with a single hyphen, trim leading and 

582 trailing hyphens, and fall back to ``"local"`` if the result is 

583 empty. ``-N`` is appended downstream so we strip trailing 

584 hyphens to keep the join clean. 

585 """ 

586 slug = re.sub(r"[^a-z0-9]+", "-", name.strip().lower()).strip("-") 

587 return slug or "local" 

588 

589 @staticmethod 

590 def _is_linkable_url(url: str) -> bool: 

591 """Return True iff ``url`` is a http(s) URL safe to wrap in a 

592 markdown hyperlink. Empty strings and file:// / local: schemes 

593 are not linkable.""" 

594 if not url: 

595 return False 

596 try: 

597 scheme = (urlparse(url).scheme or "").lower() 

598 except (ValueError, AttributeError): 

599 return False 

600 return scheme in ("http", "https") 

601 

602 def _extract_source_label( 

603 self, url: str, collection: str | None = None 

604 ) -> str: 

605 """Return a short source tag for ``url``. 

606 

607 Resolution order: 

608 1. ``collection`` (when supplied) wins outright — RAG / library 

609 hits surface their collection name as the citation tag 

610 (``mypapers``, ``personal-notes``, ...). The renderer in 

611 ``utilities/search_utilities.format_links_to_markdown`` 

612 emits a ``Collection:`` line per source for library results, 

613 which the formatter parses back into this argument. 

614 2. Empty URL or non-http(s) scheme (``file://``, ``local:``, ...) → 

615 ``"local"``. Uniform fallback when no collection name is 

616 available. 

617 3. ``URLClassifier`` matches a known academic source → use the 

618 enum value (``arxiv``, ``pubmed``, ``pmc``, ``biorxiv``, 

619 ``medrxiv``, ``semantic_scholar``, ``doi``). 

620 4. Otherwise → fall back to ``_extract_domain`` (e.g. 

621 ``arxiv.org``, ``nytimes.com``). 

622 """ 

623 if collection: 

624 return self._slugify_collection(collection) 

625 if not url: 

626 return "local" 

627 try: 

628 parsed = urlparse(url) 

629 except (ValueError, AttributeError): 

630 return "local" 

631 scheme = (parsed.scheme or "").lower() 

632 if scheme not in ("http", "https"): 

633 return "local" 

634 

635 # Lazy import to keep the formatter usable when the content_fetcher 

636 # package isn't importable (e.g. minimal test setups). 

637 try: 

638 from ..content_fetcher.url_classifier import URLClassifier, URLType 

639 except ImportError: 

640 return self._extract_domain(url) 

641 

642 url_type = URLClassifier.classify(url) 

643 # Generic HTML/PDF/INVALID → fall back to domain. Everything else 

644 # is a known academic source whose enum value is the short tag. 

645 if url_type in (URLType.HTML, URLType.PDF, URLType.INVALID): 

646 return self._extract_domain(url) 

647 return url_type.value 

648 

649 def _extract_domain(self, url: str) -> str: 

650 """Extract domain name from URL.""" 

651 try: 

652 parsed = urlparse(url) 

653 domain = parsed.netloc 

654 # Remove www. prefix if present 

655 if domain.startswith("www."): 

656 domain = domain[4:] 

657 # Keep known domains as-is 

658 known_domains = { 

659 "arxiv.org": "arxiv.org", 

660 "github.com": "github.com", 

661 "reddit.com": "reddit.com", 

662 "youtube.com": "youtube.com", 

663 "pypi.org": "pypi.org", 

664 "milvus.io": "milvus.io", 

665 "medium.com": "medium.com", 

666 } 

667 

668 for known, display in known_domains.items(): 

669 if known in domain: 

670 return display 

671 

672 # For other domains, extract main domain 

673 parts = domain.split(".") 

674 if len(parts) >= 2: 

675 return ".".join(parts[-2:]) 

676 return domain 

677 except (ValueError, AttributeError): 

678 return "source" 

679 

680 

681class QuartoExporter: 

682 """Export markdown documents to Quarto (.qmd) format.""" 

683 

684 def __init__(self): 

685 # Also match Unicode lenticular brackets 【】 (U+3010 and U+3011) that LLMs sometimes generate 

686 self.citation_pattern = re.compile( 

687 r"(?<![\[【])[\[【](\d+)[\]】](?![\]】])" 

688 ) 

689 self.comma_citation_pattern = re.compile( 

690 r"[\[【](\d+(?:,\s*\d+)+)[\]】]" 

691 ) 

692 

693 def export_to_quarto(self, content: str, title: str | None = None) -> str: 

694 """ 

695 Convert markdown document to Quarto format. 

696 

697 Args: 

698 content: Markdown content 

699 title: Document title (if None, will extract from content) 

700 

701 Returns: 

702 Quarto formatted content 

703 """ 

704 # Extract title from markdown if not provided 

705 if not title: 

706 title_match = re.search(r"^#\s+(.+)$", content, re.MULTILINE) 

707 title = title_match.group(1) if title_match else "Research Report" 

708 

709 # Create Quarto YAML header 

710 from datetime import datetime, UTC 

711 

712 current_date = datetime.now(UTC).strftime("%Y-%m-%d") 

713 yaml_header = f"""--- 

714title: "{title}" 

715author: "Local Deep Research" 

716date: "{current_date}" 

717format: 

718 html: 

719 toc: true 

720 toc-depth: 3 

721 number-sections: true 

722 pdf: 

723 toc: true 

724 number-sections: true 

725 colorlinks: true 

726bibliography: references.bib 

727csl: apa.csl 

728--- 

729 

730""" 

731 

732 # Process content 

733 processed_content = content 

734 

735 # First handle comma-separated citations like [1, 2, 3] 

736 def replace_comma_citations(match): 

737 citation_nums = match.group(1) 

738 # Split by comma and strip whitespace 

739 nums = [num.strip() for num in citation_nums.split(",")] 

740 refs = [f"@ref{num}" for num in nums] 

741 return f"[{', '.join(refs)}]" 

742 

743 processed_content = self.comma_citation_pattern.sub( 

744 replace_comma_citations, processed_content 

745 ) 

746 

747 # Then convert individual citations to Quarto format [@citation] 

748 def replace_citation(match): 

749 citation_num = match.group(1) 

750 return f"[@ref{citation_num}]" 

751 

752 processed_content = self.citation_pattern.sub( 

753 replace_citation, processed_content 

754 ) 

755 

756 # Generate bibliography file content 

757 bib_content = self._generate_bibliography(content) 

758 

759 # Add note about bibliography file 

760 bibliography_note = ( 

761 "\n\n::: {.callout-note}\n## Bibliography File Required\n\nThis document requires a `references.bib` file in the same directory with the following content:\n\n```bibtex\n" 

762 + bib_content 

763 + "\n```\n:::\n" 

764 ) 

765 

766 return yaml_header + processed_content + bibliography_note 

767 

768 def _generate_bibliography(self, content: str) -> str: 

769 """Generate BibTeX bibliography from sources.""" 

770 sources_pattern = re.compile( 

771 r"^\[(\d+)\]\s*(.+?)(?:\n\s*URL:\s*(.+?))?$", re.MULTILINE 

772 ) 

773 

774 bibliography = "" 

775 matches = list(sources_pattern.finditer(content)) 

776 

777 for match in matches: 

778 citation_num = match.group(1) 

779 title = match.group(2).strip() 

780 url = match.group(3).strip() if match.group(3) else "" 

781 

782 # Generate BibTeX entry 

783 bib_entry = f"@misc{{ref{citation_num},\n" 

784 bib_entry += f' title = "{{{title}}}",\n' 

785 if url: 

786 bib_entry += f" url = {{{url}}},\n" 

787 bib_entry += f' howpublished = "\\url{{{url}}}",\n' 

788 bib_entry += f" year = {{{2024}}},\n" 

789 bib_entry += ' note = "Accessed: \\today"\n' 

790 bib_entry += "}\n" 

791 

792 bibliography += bib_entry + "\n" 

793 

794 return bibliography.strip() 

795 

796 

797class RISExporter: 

798 """Export references to RIS format for reference managers like Zotero.""" 

799 

800 def __init__(self): 

801 self.sources_pattern = re.compile( 

802 r"^\[(\d+(?:,\s*\d+)*)\]\s*(.+?)(?:\n\s*URL:\s*(.+?))?$", 

803 re.MULTILINE, 

804 ) 

805 

806 def export_to_ris(self, content: str) -> str: 

807 """ 

808 Extract references from markdown and convert to RIS format. 

809 

810 Args: 

811 content: Markdown content with sources 

812 

813 Returns: 

814 RIS formatted references 

815 """ 

816 # Find sources section 

817 sources_start = find_sources_section(content) 

818 if sources_start == -1: 

819 return "" 

820 

821 # Find the end of the first sources section (before any other major section) 

822 sources_content = content[sources_start:] 

823 

824 # Look for the next major section to avoid duplicates 

825 next_section_markers = [ 

826 "\n## ALL SOURCES", 

827 "\n### ALL SOURCES", 

828 "\n## Research Metrics", 

829 "\n### Research Metrics", 

830 "\n## SEARCH QUESTIONS", 

831 "\n### SEARCH QUESTIONS", 

832 "\n## DETAILED FINDINGS", 

833 "\n### DETAILED FINDINGS", 

834 "\n---", # Horizontal rule often separates sections 

835 ] 

836 

837 sources_end = len(sources_content) 

838 for marker in next_section_markers: 

839 pos = sources_content.find(marker) 

840 if pos != -1 and pos < sources_end: 

841 sources_end = pos 

842 

843 sources_content = sources_content[:sources_end] 

844 

845 # Parse sources and generate RIS entries 

846 ris_entries = [] 

847 seen_refs = set() # Track which references we've already processed 

848 

849 # Split sources into individual entries 

850 import re 

851 

852 # Pattern to match each source entry. Accept both ASCII "[N]" and 

853 # lenticular "【N】" openers/closers — the inline citation patterns 

854 # in this file already handle lenticular brackets (some LLMs emit 

855 # them), so the source-list parser must stay consistent or it would 

856 # silently drop lenticular-bracketed source entries. 

857 source_entry_pattern = re.compile( 

858 r"^[\[【](\d+)[\]】]\s*(.+?)(?=^[\[【]\d+[\]】]|\Z)", 

859 re.MULTILINE | re.DOTALL, 

860 ) 

861 

862 for match in source_entry_pattern.finditer(sources_content): 

863 citation_num = match.group(1) 

864 entry_text = match.group(2).strip() 

865 

866 # Extract the title (first line) 

867 lines = entry_text.split("\n") 

868 title = lines[0].strip() 

869 

870 # Extract URL, DOI, and other metadata from subsequent lines 

871 url = "" 

872 metadata = {} 

873 for line in lines[1:]: 

874 line = line.strip() 

875 if line.startswith("URL:"): 

876 url = line[4:].strip() 

877 elif line.startswith("DOI:"): 

878 metadata["doi"] = line[4:].strip() 

879 elif line.startswith("Published in"): 

880 metadata["journal"] = line[12:].strip() 

881 # Add more metadata parsing as needed 

882 elif line: 882 ↛ 873line 882 didn't jump to line 873 because the condition on line 882 was always true

883 # Store other lines as additional metadata 

884 if "additional" not in metadata: 884 ↛ 886line 884 didn't jump to line 886 because the condition on line 884 was always true

885 metadata["additional"] = [] 

886 additional = metadata["additional"] 

887 if isinstance(additional, list): 887 ↛ 873line 887 didn't jump to line 873 because the condition on line 887 was always true

888 additional.append(line) 

889 

890 # Combine title with additional metadata lines for full context 

891 full_text = entry_text 

892 

893 # Create a unique key to avoid duplicates 

894 ref_key = (citation_num, title, url) 

895 if ref_key not in seen_refs: 895 ↛ 862line 895 didn't jump to line 862 because the condition on line 895 was always true

896 seen_refs.add(ref_key) 

897 # Create RIS entry with full text for metadata extraction 

898 ris_entry = self._create_ris_entry( 

899 citation_num, full_text, url, metadata 

900 ) 

901 ris_entries.append(ris_entry) 

902 

903 return "\n".join(ris_entries) 

904 

905 def _create_ris_entry( 

906 self, 

907 ref_id: str, 

908 full_text: str, 

909 url: str = "", 

910 metadata: dict | None = None, 

911 ) -> str: 

912 """Create a single RIS entry.""" 

913 lines = [] 

914 

915 # Parse metadata from full text 

916 import re 

917 

918 if metadata is None: 

919 metadata = {} 

920 

921 # Extract title from first line 

922 lines = full_text.split("\n") 

923 title = lines[0].strip() 

924 

925 # Extract year from full text (looks for 4-digit year) 

926 year_match = re.search(r"\b(19\d{2}|20\d{2})\b", full_text) 

927 year = year_match.group(1) if year_match else None 

928 

929 # Extract authors if present (looks for "by Author1, Author2") 

930 authors_match = re.search( 

931 r"\bby\s+([^.\n]+?)(?:\.|\n|$)", full_text, re.IGNORECASE 

932 ) 

933 authors = [] 

934 if authors_match: 

935 authors_text = authors_match.group(1) 

936 # Split by 'and' or ',' 

937 author_parts = re.split(r"\s*(?:,|\sand\s|&)\s*", authors_text) 

938 authors = [a.strip() for a in author_parts if a.strip()] 

939 

940 # Extract DOI from metadata or text 

941 doi = metadata.get("doi") 

942 if not doi: 

943 doi_match = re.search( 

944 r"DOI:\s*([^\s\n]+)", full_text, re.IGNORECASE 

945 ) 

946 doi = doi_match.group(1) if doi_match else None 

947 

948 # Clean title - remove author and metadata info for cleaner title 

949 clean_title = title 

950 if authors_match and authors_match.start() < len(title): 

951 clean_title = ( 

952 title[: authors_match.start()] + title[authors_match.end() :] 

953 if authors_match.end() < len(title) 

954 else title[: authors_match.start()] 

955 ) 

956 clean_title = re.sub( 

957 r"\s*DOI:\s*[^\s]+", "", clean_title, flags=re.IGNORECASE 

958 ) 

959 clean_title = re.sub( 

960 r"\s*Published in.*", "", clean_title, flags=re.IGNORECASE 

961 ) 

962 clean_title = re.sub( 

963 r"\s*Volume.*", "", clean_title, flags=re.IGNORECASE 

964 ) 

965 clean_title = re.sub( 

966 r"\s*Pages.*", "", clean_title, flags=re.IGNORECASE 

967 ) 

968 clean_title = clean_title.strip() 

969 

970 # TY - Type of reference (ELEC for electronic source/website) 

971 lines.append("TY - ELEC") 

972 

973 # ID - Reference ID 

974 lines.append(f"ID - ref{ref_id}") 

975 

976 # TI - Title 

977 lines.append(f"TI - {clean_title if clean_title else title}") 

978 

979 # AU - Authors 

980 for author in authors: 

981 lines.append(f"AU - {author}") 

982 

983 # DO - DOI 

984 if doi: 

985 lines.append(f"DO - {doi}") 

986 

987 # PY - Publication year (if found in title) 

988 if year: 

989 lines.append(f"PY - {year}") 

990 

991 # UR - URL 

992 if url: 

993 lines.append(f"UR - {url}") 

994 

995 # Try to extract domain as publisher 

996 try: 

997 from urllib.parse import urlparse 

998 

999 parsed = urlparse(url) 

1000 domain = parsed.netloc 

1001 if domain.startswith("www."): 

1002 domain = domain[4:] 

1003 # Extract readable publisher name from domain 

1004 if domain == "github.com" or domain.endswith(".github.com"): 

1005 lines.append("PB - GitHub") 

1006 elif domain == "arxiv.org" or domain.endswith(".arxiv.org"): 

1007 lines.append("PB - arXiv") 

1008 elif domain == "reddit.com" or domain.endswith(".reddit.com"): 

1009 lines.append("PB - Reddit") 

1010 elif ( 

1011 domain == "youtube.com" 

1012 or domain == "m.youtube.com" 

1013 or domain.endswith(".youtube.com") 

1014 ): 

1015 lines.append("PB - YouTube") 

1016 elif domain == "medium.com" or domain.endswith(".medium.com"): 

1017 lines.append("PB - Medium") 

1018 elif domain == "pypi.org" or domain.endswith(".pypi.org"): 

1019 lines.append("PB - Python Package Index (PyPI)") 

1020 else: 

1021 # Use domain as publisher 

1022 lines.append(f"PB - {domain}") 

1023 except (ValueError, AttributeError): 

1024 pass 

1025 

1026 # Y1 - Year accessed (current year) 

1027 from datetime import datetime, UTC 

1028 

1029 current_year = datetime.now(UTC).year 

1030 lines.append(f"Y1 - {current_year}") 

1031 

1032 # DA - Date accessed 

1033 current_date = datetime.now(UTC).strftime("%Y/%m/%d") 

1034 lines.append(f"DA - {current_date}") 

1035 

1036 # LA - Language 

1037 lines.append("LA - en") 

1038 

1039 # ER - End of reference 

1040 lines.append("ER - ") 

1041 

1042 return "\n".join(lines) 

1043 

1044 

class LaTeXExporter:
    """Export markdown documents to LaTeX format."""

    # Single-pass escape table for arbitrary text.  A single pass matters:
    # chained ``str.replace`` calls would re-escape the braces of an already
    # inserted ``\textbackslash{}``.
    _LATEX_ESCAPES = str.maketrans(
        {
            "\\": r"\textbackslash{}",
            "&": r"\&",
            "%": r"\%",
            "$": r"\$",
            "#": r"\#",
            "_": r"\_",
            "{": r"\{",
            "}": r"\}",
            "~": r"\textasciitilde{}",
            "^": r"\textasciicircum{}",
        }
    )

    # Reduced escape table for body text: markdown markers (*, #, [, ]) are
    # left alone because later conversion passes still need to see them.
    _BODY_ESCAPES = str.maketrans({"&": r"\&", "%": r"\%", "_": r"\_"})

    # One source-list line "[N] title" with an optional "URL: ..." line
    # after it.  Lenticular brackets 【N】 are accepted as well so that the
    # bibliography stays consistent with ``citation_pattern``.
    _BIB_ENTRY_PATTERN = re.compile(
        r"^[\[【](\d+)[\]】]\s*(.+?)(?:\n\s*URL:\s*(.+?))?$", re.MULTILINE
    )

    def __init__(self):
        # Also match Unicode lenticular brackets 【】 (U+3010 and U+3011)
        # that LLMs sometimes generate
        self.citation_pattern = re.compile(r"[\[【](\d+)[\]】]")
        self.heading_patterns = [
            (re.compile(r"^# (.+)$", re.MULTILINE), r"\\section{\1}"),
            (re.compile(r"^## (.+)$", re.MULTILINE), r"\\subsection{\1}"),
            (re.compile(r"^### (.+)$", re.MULTILINE), r"\\subsubsection{\1}"),
        ]
        # Order matters: bold (**) must be consumed before italic (*).
        self.emphasis_patterns = [
            (re.compile(r"\*\*(.+?)\*\*"), r"\\textbf{\1}"),
            (re.compile(r"\*(.+?)\*"), r"\\textit{\1}"),
            (re.compile(r"`(.+?)`"), r"\\texttt{\1}"),
        ]

    def export_to_latex(self, content: str) -> str:
        """
        Convert markdown document to LaTeX format.

        Args:
            content: Markdown content

        Returns:
            LaTeX formatted content: document header, converted body,
            bibliography (when a sources section exists) and footer.
        """
        body_content = self._escape_outside_math(content)

        # Convert headings
        for pattern, replacement in self.heading_patterns:
            body_content = pattern.sub(replacement, body_content)

        # Convert emphasis
        for pattern, replacement in self.emphasis_patterns:
            body_content = pattern.sub(replacement, body_content)

        # Convert citations to LaTeX \cite{} format
        body_content = self.citation_pattern.sub(r"\\cite{\1}", body_content)

        # Convert lists
        body_content = self._convert_lists(body_content)

        return "".join(
            (
                self._create_latex_header(),
                body_content,
                self._create_bibliography(content),
                self._create_latex_footer(),
            )
        )

    def _escape_outside_math(self, content: str) -> str:
        """Escape ``&``, ``%`` and ``_`` in *content* outside math mode.

        The text is split on ``$``: odd-numbered pieces lie between a pair of
        dollars and are left untouched.  An even-numbered piece that has an
        empty piece on both sides is the inside of ``$$...$$`` and is left
        untouched too.  Lines starting with ``#`` (headings) are not escaped.
        """
        parts = content.split("$")
        last_index = len(parts) - 1

        for i, part in enumerate(parts):
            if i % 2:
                continue  # inside $...$
            in_display_math = (
                0 < i < last_index
                and parts[i - 1] == ""
                and parts[i + 1] == ""
            )
            if in_display_math:
                continue
            parts[i] = "\n".join(
                line
                if line.strip().startswith("#")
                else line.translate(self._BODY_ESCAPES)
                for line in part.split("\n")
            )

        return "$".join(parts)

    def _create_latex_header(self) -> str:
        """Create LaTeX document header."""
        return r"""\documentclass[12pt]{article}
\usepackage[utf8]{inputenc}
\usepackage{hyperref}
\usepackage{cite}
\usepackage{url}

\title{Research Report}
\date{\today}

\begin{document}
\maketitle

"""

    def _create_latex_footer(self) -> str:
        """Create LaTeX document footer."""
        return "\n\\end{document}\n"

    def _escape_latex(self, text: str) -> str:
        """Escape special LaTeX characters in text.

        Every special character is replaced exactly once, in a single pass
        over *text*, so replacement output is never escaped again.
        """
        return text.translate(self._LATEX_ESCAPES)

    def _convert_lists(self, content: str) -> str:
        """Convert markdown lists to LaTeX format.

        Lines of the form ``- text`` become ``\\item text`` and each run of
        items is wrapped in an ``itemize`` environment.  A blank line does
        not end a list; the first non-blank, non-item line does.
        """
        # Simple conversion for bullet points
        content = re.sub(r"^- (.+)$", r"\\item \1", content, flags=re.MULTILINE)

        result = []
        in_list = False

        for line in content.split("\n"):
            is_item = line.strip().startswith("\\item")
            if is_item and not in_list:
                result.append("\\begin{itemize}")
                in_list = True
            elif not is_item and in_list and line.strip():
                result.append("\\end{itemize}")
                in_list = False
            result.append(line)

        if in_list:
            result.append("\\end{itemize}")

        return "\n".join(result)

    def _create_bibliography(self, content: str) -> str:
        """Extract sources and create LaTeX bibliography.

        Returns an empty string when *content* has no sources section.
        """
        sources_start = find_sources_section(content)
        if sources_start == -1:
            return ""

        return self._render_bibliography(content[sources_start:])

    def _render_bibliography(self, sources_content: str) -> str:
        """Render a ``thebibliography`` block from a sources section.

        Each ``[N] title`` / ``【N】 title`` line becomes one ``\\bibitem{N}``;
        the title is LaTeX-escaped and a following ``URL:`` line, if any, is
        emitted via ``\\url{}``.
        """
        items = []
        for match in self._BIB_ENTRY_PATTERN.finditer(sources_content):
            citation_num = match.group(1)
            escaped_title = self._escape_latex(match.group(2).strip())
            url = match.group(3).strip() if match.group(3) else ""

            if url:
                items.append(
                    f"\\bibitem{{{citation_num}}} {escaped_title}. \\url{{{url}}}\n"
                )
            else:
                items.append(f"\\bibitem{{{citation_num}}} {escaped_title}.\n")

        return (
            "\n\\begin{thebibliography}{99}\n"
            + "".join(items)
            + "\\end{thebibliography}\n"
        )