Coverage for src / local_deep_research / text_optimization / citation_formatter.py: 97%

396 statements  

« prev     ^ index     » next       coverage.py v7.13.4, created at 2026-02-25 01:07 +0000

1"""Citation formatter for adding hyperlinks and alternative citation styles.""" 

2 

3import re 

4from enum import Enum 

5from typing import Dict, Tuple 

6from urllib.parse import urlparse 

7 

# Headings that introduce the sources/references section, either as a
# markdown heading ("## Sources") or as a bare labelled line ("Sources:").
_SOURCES_SECTION_PATTERNS = [
    re.compile(expr, re.MULTILINE | re.IGNORECASE)
    for expr in (
        r"^#{1,3}\s*(?:Sources|References|Bibliography|Citations)",
        r"^(?:Sources|References|Bibliography|Citations):?\s*$",
    )
]


def find_sources_section(content: str) -> int:
    """Find the start position of the sources/references section in *content*.

    Returns -1 if no section is found.
    """
    for candidate in _SOURCES_SECTION_PATTERNS:
        hit = candidate.search(content)
        if hit is not None:
            return hit.start()
    return -1

30 

31 

class CitationMode(Enum):
    """Available citation formatting modes."""

    # [1] rendered as a markdown hyperlink
    NUMBER_HYPERLINKS = "number_hyperlinks"
    # [arxiv.org] rendered as a hyperlink
    DOMAIN_HYPERLINKS = "domain_hyperlinks"
    # [arxiv.org] or [arxiv.org-1]: IDs added only when a domain repeats
    DOMAIN_ID_HYPERLINKS = "domain_id_hyperlinks"
    # [arxiv.org-1]: every citation carries an ID
    DOMAIN_ID_ALWAYS_HYPERLINKS = "domain_id_always_hyperlinks"
    # plain [1] left untouched
    NO_HYPERLINKS = "no_hyperlinks"

44 

45 

class CitationFormatter:
    """Formats citations in markdown documents with various styles.

    The formatter locates the sources/references section of a markdown
    report, parses the numbered source list, and rewrites in-text
    citations such as ``[1]``, ``[1, 2]`` or ``Source 1`` according to
    the configured :class:`CitationMode`.
    """

    def __init__(self, mode: CitationMode = CitationMode.NUMBER_HYPERLINKS):
        self.mode = mode
        # Negative lookbehind/lookahead keep us from re-matching citations
        # that are already hyperlinked (e.g. "[[1]](url)").  Unicode
        # lenticular brackets 【】 (U+3010/U+3011), which LLMs sometimes
        # generate, are matched as well.
        self.citation_pattern = re.compile(
            r"(?<![\[【])[\[【](\d+)[\]】](?![\]】])"
        )
        self.comma_citation_pattern = re.compile(
            r"[\[【](\d+(?:,\s*\d+)+)[\]】]"
        )
        # Also match "Source X" or "source X" patterns
        self.source_word_pattern = re.compile(r"\b[Ss]ource\s+(\d+)\b")
        self.sources_pattern = re.compile(
            r"^\[(\d+(?:,\s*\d+)*)\]\s*(.+?)(?:\n\s*URL:\s*(.+?))?$",
            re.MULTILINE,
        )

    def _create_source_word_replacer(self, formatter_func):
        """Create a replacement function for 'Source X' patterns.

        Args:
            formatter_func: A function that takes citation_num and returns
                formatted text

        Returns:
            A replacement function for use with regex sub
        """

        def replace_source_word(match):
            return formatter_func(match.group(1))

        return replace_source_word

    def _create_citation_formatter(self, sources_dict, format_pattern):
        """Create a formatter function for citations.

        Args:
            sources_dict: Dictionary mapping citation numbers to data
            format_pattern: A callable that takes (citation_num, data) and
                returns a formatted string

        Returns:
            A function that formats known citations and falls back to a
            plain ``[N]`` for unknown ones
        """

        def formatter(citation_num):
            if citation_num in sources_dict:
                return format_pattern(citation_num, sources_dict[citation_num])
            return f"[{citation_num}]"

        return formatter

    def _replace_comma_citations(self, content, lookup, format_one):
        """Replace comma-separated citations like [1, 2, 3] using *lookup* and *format_one*.

        Args:
            content: Text to process
            lookup: Dict mapping citation number (str) to data
            format_one: ``(num, data) -> str`` callback that formats a single citation
        """

        def _replacer(match):
            parts = []
            for num in (n.strip() for n in match.group(1).split(",")):
                if num in lookup:
                    parts.append(format_one(num, lookup[num]))
                else:
                    # Unknown numbers degrade to a plain bracketed citation.
                    parts.append(f"[{num}]")
            return "".join(parts)

        return self.comma_citation_pattern.sub(_replacer, content)

    def _apply_citation_formatting(self, content, lookup, format_one):
        """Run the three citation substitution passes with one callback.

        Handles comma-separated groups ([1, 2]), single citations ([1]) and
        the "Source X" wording, in that order.  Single citations whose
        number is missing from *lookup* are left byte-identical (preserving
        e.g. lenticular brackets), matching the historical behavior.
        """
        content = self._replace_comma_citations(content, lookup, format_one)
        formatter = self._create_citation_formatter(lookup, format_one)

        def replace_citation(match):
            return (
                formatter(match.group(1))
                if match.group(1) in lookup
                else match.group(0)
            )

        content = self.citation_pattern.sub(replace_citation, content)

        # Also handle "Source X" patterns
        return self.source_word_pattern.sub(
            self._create_source_word_replacer(formatter), content
        )

    def format_document(self, content: str) -> str:
        """
        Format citations in the document according to the selected mode.

        Args:
            content: The markdown content to format

        Returns:
            Formatted markdown content
        """
        if self.mode == CitationMode.NO_HYPERLINKS:
            return content

        # Extract sources section; without one there is nothing to link.
        sources_start = self._find_sources_section(content)
        if sources_start == -1:
            return content

        document_content = content[:sources_start]
        sources_content = content[sources_start:]

        # Parse sources
        sources = self._parse_sources(sources_content)

        # Format citations in document
        if self.mode == CitationMode.NUMBER_HYPERLINKS:
            formatted_content = self._format_number_hyperlinks(
                document_content, sources
            )
        elif self.mode == CitationMode.DOMAIN_HYPERLINKS:
            formatted_content = self._format_domain_hyperlinks(
                document_content, sources
            )
        elif self.mode == CitationMode.DOMAIN_ID_HYPERLINKS:
            formatted_content = self._format_domain_id_hyperlinks(
                document_content, sources
            )
        elif self.mode == CitationMode.DOMAIN_ID_ALWAYS_HYPERLINKS:
            formatted_content = self._format_domain_id_always_hyperlinks(
                document_content, sources
            )
        else:
            formatted_content = document_content

        # Rebuild document; the sources section itself is left untouched.
        return formatted_content + sources_content

    def _find_sources_section(self, content: str) -> int:
        """Find the start of the sources/references section."""
        return find_sources_section(content)

    def _parse_sources(
        self, sources_content: str
    ) -> Dict[str, Tuple[str, str]]:
        """
        Parse sources section to extract citation numbers, titles, and URLs.

        Returns:
            Dictionary mapping citation number to (title, url) tuple
        """
        sources = {}
        for match in self.sources_pattern.finditer(sources_content):
            citation_nums_str = match.group(1)
            title = match.group(2).strip()
            url = match.group(3).strip() if match.group(3) else ""

            # A bracket may hold several numbers, e.g. "[36, 3]"; each one
            # maps to the same (title, url) pair.
            for num in citation_nums_str.split(","):
                sources[num.strip()] = (title, url)

        return sources

    @staticmethod
    def _filter_url_sources(
        sources: Dict[str, Tuple[str, str]],
    ) -> Dict[str, Tuple[str, str]]:
        """Return only the sources that actually carry a URL."""
        return {
            num: (title, url) for num, (title, url) in sources.items() if url
        }

    def _build_domain_id_map(
        self, sources: Dict[str, Tuple[str, str]], always_id: bool
    ) -> Dict[str, Tuple[str, str]]:
        """Map citation numbers to ``(domain_label, url)`` pairs.

        Sources sharing a domain get "-1", "-2", ... suffixes in source
        order; a domain with a single source keeps the bare domain unless
        *always_id* is True.
        """
        domain_citations = {}
        for citation_num, (_title, url) in sources.items():
            if url:
                domain = self._extract_domain(url)
                domain_citations.setdefault(domain, []).append(
                    (citation_num, url)
                )

        citation_to_domain_id = {}
        for domain, citations in domain_citations.items():
            if always_id or len(citations) > 1:
                # Multiple citations from same domain - add hyphen and number
                for idx, (citation_num, url) in enumerate(citations, 1):
                    citation_to_domain_id[citation_num] = (
                        f"{domain}-{idx}",
                        url,
                    )
            else:
                # Single citation from domain - no ID needed
                citation_num, url = citations[0]
                citation_to_domain_id[citation_num] = (domain, url)
        return citation_to_domain_id

    def _format_number_hyperlinks(
        self, content: str, sources: Dict[str, Tuple[str, str]]
    ) -> str:
        """Replace [1] with hyperlinked version where only the number is linked."""
        url_sources = self._filter_url_sources(sources)

        def format_number_link(citation_num, data):
            _, url = data
            return f"[[{citation_num}]]({url})"

        return self._apply_citation_formatting(
            content, url_sources, format_number_link
        )

    def _format_domain_hyperlinks(
        self, content: str, sources: Dict[str, Tuple[str, str]]
    ) -> str:
        """Replace [1] with [domain.com] hyperlinked version."""
        url_sources = self._filter_url_sources(sources)

        def format_domain_link(citation_num, data):
            _, url = data
            domain = self._extract_domain(url)
            return f"[[{domain}]]({url})"

        return self._apply_citation_formatting(
            content, url_sources, format_domain_link
        )

    def _format_domain_id_hyperlinks(
        self, content: str, sources: Dict[str, Tuple[str, str]]
    ) -> str:
        """Replace [1] with [domain.com] or [domain.com-1] hyperlinks (smart IDs)."""
        mapping = self._build_domain_id_map(sources, always_id=False)

        def format_domain_id_link(citation_num, data):
            domain_id, url = data
            return f"[[{domain_id}]]({url})"

        return self._apply_citation_formatting(
            content, mapping, format_domain_id_link
        )

    def _format_domain_id_always_hyperlinks(
        self, content: str, sources: Dict[str, Tuple[str, str]]
    ) -> str:
        """Replace [1] with [domain.com-1] hyperlinked version, always with IDs."""
        mapping = self._build_domain_id_map(sources, always_id=True)

        def format_domain_id_link(citation_num, data):
            domain_id, url = data
            return f"[[{domain_id}]]({url})"

        return self._apply_citation_formatting(
            content, mapping, format_domain_id_link
        )

    def _to_superscript(self, text: str) -> str:
        """Convert ASCII digits in *text* to Unicode superscript."""
        superscript_map = {
            "0": "⁰",
            "1": "¹",
            "2": "²",
            "3": "³",
            "4": "⁴",
            "5": "⁵",
            "6": "⁶",
            "7": "⁷",
            "8": "⁸",
            "9": "⁹",
        }
        return "".join(superscript_map.get(c, c) for c in text)

    def _extract_domain(self, url: str) -> str:
        """Extract a short display domain from *url*.

        Well-known domains are returned verbatim; others are reduced to
        their last two labels.  Falls back to "source" on unparsable input.
        """
        try:
            domain = urlparse(url).netloc
            # Remove www. prefix if present
            if domain.startswith("www."):
                domain = domain[4:]

            # Keep known domains as-is
            known_domains = {
                "arxiv.org": "arxiv.org",
                "github.com": "github.com",
                "reddit.com": "reddit.com",
                "youtube.com": "youtube.com",
                "pypi.org": "pypi.org",
                "milvus.io": "milvus.io",
                "medium.com": "medium.com",
            }
            for known, display in known_domains.items():
                if known in domain:
                    return display

            # For other domains, keep only the registrable part
            parts = domain.split(".")
            if len(parts) >= 2:
                return ".".join(parts[-2:])
            return domain
        except (ValueError, AttributeError):
            return "source"

434 

435 

436class QuartoExporter: 

437 """Export markdown documents to Quarto (.qmd) format.""" 

438 

439 def __init__(self): 

440 # Also match Unicode lenticular brackets 【】 (U+3010 and U+3011) that LLMs sometimes generate 

441 self.citation_pattern = re.compile( 

442 r"(?<![\[【])[\[【](\d+)[\]】](?![\]】])" 

443 ) 

444 self.comma_citation_pattern = re.compile( 

445 r"[\[【](\d+(?:,\s*\d+)+)[\]】]" 

446 ) 

447 

448 def export_to_quarto(self, content: str, title: str = None) -> str: 

449 """ 

450 Convert markdown document to Quarto format. 

451 

452 Args: 

453 content: Markdown content 

454 title: Document title (if None, will extract from content) 

455 

456 Returns: 

457 Quarto formatted content 

458 """ 

459 # Extract title from markdown if not provided 

460 if not title: 

461 title_match = re.search(r"^#\s+(.+)$", content, re.MULTILINE) 

462 title = title_match.group(1) if title_match else "Research Report" 

463 

464 # Create Quarto YAML header 

465 from datetime import datetime, UTC 

466 

467 current_date = datetime.now(UTC).strftime("%Y-%m-%d") 

468 yaml_header = f"""--- 

469title: "{title}" 

470author: "Local Deep Research" 

471date: "{current_date}" 

472format: 

473 html: 

474 toc: true 

475 toc-depth: 3 

476 number-sections: true 

477 pdf: 

478 toc: true 

479 number-sections: true 

480 colorlinks: true 

481bibliography: references.bib 

482csl: apa.csl 

483--- 

484 

485""" 

486 

487 # Process content 

488 processed_content = content 

489 

490 # First handle comma-separated citations like [1, 2, 3] 

491 def replace_comma_citations(match): 

492 citation_nums = match.group(1) 

493 # Split by comma and strip whitespace 

494 nums = [num.strip() for num in citation_nums.split(",")] 

495 refs = [f"@ref{num}" for num in nums] 

496 return f"[{', '.join(refs)}]" 

497 

498 processed_content = self.comma_citation_pattern.sub( 

499 replace_comma_citations, processed_content 

500 ) 

501 

502 # Then convert individual citations to Quarto format [@citation] 

503 def replace_citation(match): 

504 citation_num = match.group(1) 

505 return f"[@ref{citation_num}]" 

506 

507 processed_content = self.citation_pattern.sub( 

508 replace_citation, processed_content 

509 ) 

510 

511 # Generate bibliography file content 

512 bib_content = self._generate_bibliography(content) 

513 

514 # Add note about bibliography file 

515 bibliography_note = ( 

516 "\n\n::: {.callout-note}\n## Bibliography File Required\n\nThis document requires a `references.bib` file in the same directory with the following content:\n\n```bibtex\n" 

517 + bib_content 

518 + "\n```\n:::\n" 

519 ) 

520 

521 return yaml_header + processed_content + bibliography_note 

522 

523 def _generate_bibliography(self, content: str) -> str: 

524 """Generate BibTeX bibliography from sources.""" 

525 sources_pattern = re.compile( 

526 r"^\[(\d+)\]\s*(.+?)(?:\n\s*URL:\s*(.+?))?$", re.MULTILINE 

527 ) 

528 

529 bibliography = "" 

530 matches = list(sources_pattern.finditer(content)) 

531 

532 for match in matches: 

533 citation_num = match.group(1) 

534 title = match.group(2).strip() 

535 url = match.group(3).strip() if match.group(3) else "" 

536 

537 # Generate BibTeX entry 

538 bib_entry = f"@misc{{ref{citation_num},\n" 

539 bib_entry += f' title = "{{{title}}}",\n' 

540 if url: 

541 bib_entry += f" url = {{{url}}},\n" 

542 bib_entry += f' howpublished = "\\url{{{url}}}",\n' 

543 bib_entry += f" year = {{{2024}}},\n" 

544 bib_entry += ' note = "Accessed: \\today"\n' 

545 bib_entry += "}\n" 

546 

547 bibliography += bib_entry + "\n" 

548 

549 return bibliography.strip() 

550 

551 

552class RISExporter: 

553 """Export references to RIS format for reference managers like Zotero.""" 

554 

555 def __init__(self): 

556 self.sources_pattern = re.compile( 

557 r"^\[(\d+(?:,\s*\d+)*)\]\s*(.+?)(?:\n\s*URL:\s*(.+?))?$", 

558 re.MULTILINE, 

559 ) 

560 

561 def export_to_ris(self, content: str) -> str: 

562 """ 

563 Extract references from markdown and convert to RIS format. 

564 

565 Args: 

566 content: Markdown content with sources 

567 

568 Returns: 

569 RIS formatted references 

570 """ 

571 # Find sources section 

572 sources_start = find_sources_section(content) 

573 if sources_start == -1: 

574 return "" 

575 

576 # Find the end of the first sources section (before any other major section) 

577 sources_content = content[sources_start:] 

578 

579 # Look for the next major section to avoid duplicates 

580 next_section_markers = [ 

581 "\n## ALL SOURCES", 

582 "\n### ALL SOURCES", 

583 "\n## Research Metrics", 

584 "\n### Research Metrics", 

585 "\n## SEARCH QUESTIONS", 

586 "\n### SEARCH QUESTIONS", 

587 "\n## DETAILED FINDINGS", 

588 "\n### DETAILED FINDINGS", 

589 "\n---", # Horizontal rule often separates sections 

590 ] 

591 

592 sources_end = len(sources_content) 

593 for marker in next_section_markers: 

594 pos = sources_content.find(marker) 

595 if pos != -1 and pos < sources_end: 

596 sources_end = pos 

597 

598 sources_content = sources_content[:sources_end] 

599 

600 # Parse sources and generate RIS entries 

601 ris_entries = [] 

602 seen_refs = set() # Track which references we've already processed 

603 

604 # Split sources into individual entries 

605 import re 

606 

607 # Pattern to match each source entry 

608 source_entry_pattern = re.compile( 

609 r"^\[(\d+)\]\s*(.+?)(?=^\[\d+\]|\Z)", re.MULTILINE | re.DOTALL 

610 ) 

611 

612 for match in source_entry_pattern.finditer(sources_content): 

613 citation_num = match.group(1) 

614 entry_text = match.group(2).strip() 

615 

616 # Extract the title (first line) 

617 lines = entry_text.split("\n") 

618 title = lines[0].strip() 

619 

620 # Extract URL, DOI, and other metadata from subsequent lines 

621 url = "" 

622 metadata = {} 

623 for line in lines[1:]: 

624 line = line.strip() 

625 if line.startswith("URL:"): 

626 url = line[4:].strip() 

627 elif line.startswith("DOI:"): 

628 metadata["doi"] = line[4:].strip() 

629 elif line.startswith("Published in"): 

630 metadata["journal"] = line[12:].strip() 

631 # Add more metadata parsing as needed 

632 elif line: 632 ↛ 623line 632 didn't jump to line 623 because the condition on line 632 was always true

633 # Store other lines as additional metadata 

634 if "additional" not in metadata: 634 ↛ 636line 634 didn't jump to line 636 because the condition on line 634 was always true

635 metadata["additional"] = [] 

636 metadata["additional"].append(line) 

637 

638 # Combine title with additional metadata lines for full context 

639 full_text = entry_text 

640 

641 # Create a unique key to avoid duplicates 

642 ref_key = (citation_num, title, url) 

643 if ref_key not in seen_refs: 643 ↛ 612line 643 didn't jump to line 612 because the condition on line 643 was always true

644 seen_refs.add(ref_key) 

645 # Create RIS entry with full text for metadata extraction 

646 ris_entry = self._create_ris_entry( 

647 citation_num, full_text, url, metadata 

648 ) 

649 ris_entries.append(ris_entry) 

650 

651 return "\n".join(ris_entries) 

652 

653 def _create_ris_entry( 

654 self, ref_id: str, full_text: str, url: str = "", metadata: dict = None 

655 ) -> str: 

656 """Create a single RIS entry.""" 

657 lines = [] 

658 

659 # Parse metadata from full text 

660 import re 

661 

662 if metadata is None: 

663 metadata = {} 

664 

665 # Extract title from first line 

666 lines = full_text.split("\n") 

667 title = lines[0].strip() 

668 

669 # Extract year from full text (looks for 4-digit year) 

670 year_match = re.search(r"\b(19\d{2}|20\d{2})\b", full_text) 

671 year = year_match.group(1) if year_match else None 

672 

673 # Extract authors if present (looks for "by Author1, Author2") 

674 authors_match = re.search( 

675 r"\bby\s+([^.\n]+?)(?:\.|\n|$)", full_text, re.IGNORECASE 

676 ) 

677 authors = [] 

678 if authors_match: 

679 authors_text = authors_match.group(1) 

680 # Split by 'and' or ',' 

681 author_parts = re.split(r"\s*(?:,|\sand\s|&)\s*", authors_text) 

682 authors = [a.strip() for a in author_parts if a.strip()] 

683 

684 # Extract DOI from metadata or text 

685 doi = metadata.get("doi") 

686 if not doi: 

687 doi_match = re.search( 

688 r"DOI:\s*([^\s\n]+)", full_text, re.IGNORECASE 

689 ) 

690 doi = doi_match.group(1) if doi_match else None 

691 

692 # Clean title - remove author and metadata info for cleaner title 

693 clean_title = title 

694 if authors_match and authors_match.start() < len(title): 

695 clean_title = ( 

696 title[: authors_match.start()] + title[authors_match.end() :] 

697 if authors_match.end() < len(title) 

698 else title[: authors_match.start()] 

699 ) 

700 clean_title = re.sub( 

701 r"\s*DOI:\s*[^\s]+", "", clean_title, flags=re.IGNORECASE 

702 ) 

703 clean_title = re.sub( 

704 r"\s*Published in.*", "", clean_title, flags=re.IGNORECASE 

705 ) 

706 clean_title = re.sub( 

707 r"\s*Volume.*", "", clean_title, flags=re.IGNORECASE 

708 ) 

709 clean_title = re.sub( 

710 r"\s*Pages.*", "", clean_title, flags=re.IGNORECASE 

711 ) 

712 clean_title = clean_title.strip() 

713 

714 # TY - Type of reference (ELEC for electronic source/website) 

715 lines.append("TY - ELEC") 

716 

717 # ID - Reference ID 

718 lines.append(f"ID - ref{ref_id}") 

719 

720 # TI - Title 

721 lines.append(f"TI - {clean_title if clean_title else title}") 

722 

723 # AU - Authors 

724 for author in authors: 

725 lines.append(f"AU - {author}") 

726 

727 # DO - DOI 

728 if doi: 

729 lines.append(f"DO - {doi}") 

730 

731 # PY - Publication year (if found in title) 

732 if year: 

733 lines.append(f"PY - {year}") 

734 

735 # UR - URL 

736 if url: 

737 lines.append(f"UR - {url}") 

738 

739 # Try to extract domain as publisher 

740 try: 

741 from urllib.parse import urlparse 

742 

743 parsed = urlparse(url) 

744 domain = parsed.netloc 

745 if domain.startswith("www."): 

746 domain = domain[4:] 

747 # Extract readable publisher name from domain 

748 if domain == "github.com" or domain.endswith(".github.com"): 

749 lines.append("PB - GitHub") 

750 elif domain == "arxiv.org" or domain.endswith(".arxiv.org"): 

751 lines.append("PB - arXiv") 

752 elif domain == "reddit.com" or domain.endswith(".reddit.com"): 

753 lines.append("PB - Reddit") 

754 elif ( 754 ↛ 759line 754 didn't jump to line 759 because the condition on line 754 was never true

755 domain == "youtube.com" 

756 or domain == "m.youtube.com" 

757 or domain.endswith(".youtube.com") 

758 ): 

759 lines.append("PB - YouTube") 

760 elif domain == "medium.com" or domain.endswith(".medium.com"): 760 ↛ 761line 760 didn't jump to line 761 because the condition on line 760 was never true

761 lines.append("PB - Medium") 

762 elif domain == "pypi.org" or domain.endswith(".pypi.org"): 762 ↛ 763line 762 didn't jump to line 763 because the condition on line 762 was never true

763 lines.append("PB - Python Package Index (PyPI)") 

764 else: 

765 # Use domain as publisher 

766 lines.append(f"PB - {domain}") 

767 except (ValueError, AttributeError): 

768 pass 

769 

770 # Y1 - Year accessed (current year) 

771 from datetime import datetime, UTC 

772 

773 current_year = datetime.now(UTC).year 

774 lines.append(f"Y1 - {current_year}") 

775 

776 # DA - Date accessed 

777 current_date = datetime.now(UTC).strftime("%Y/%m/%d") 

778 lines.append(f"DA - {current_date}") 

779 

780 # LA - Language 

781 lines.append("LA - en") 

782 

783 # ER - End of reference 

784 lines.append("ER - ") 

785 

786 return "\n".join(lines) 

787 

788 

class LaTeXExporter:
    """Export markdown documents to LaTeX format."""

    def __init__(self):
        # Unicode lenticular brackets 【】 (U+3010/U+3011) are accepted
        # alongside ASCII brackets because LLMs sometimes emit them.
        self.citation_pattern = re.compile(r"[\[【](\d+)[\]】]")
        self.heading_patterns = [
            (re.compile(r"^# (.+)$", re.MULTILINE), r"\\section{\1}"),
            (re.compile(r"^## (.+)$", re.MULTILINE), r"\\subsection{\1}"),
            (re.compile(r"^### (.+)$", re.MULTILINE), r"\\subsubsection{\1}"),
        ]
        self.emphasis_patterns = [
            (re.compile(r"\*\*(.+?)\*\*"), r"\\textbf{\1}"),
            (re.compile(r"\*(.+?)\*"), r"\\textit{\1}"),
            (re.compile(r"`(.+?)`"), r"\\texttt{\1}"),
        ]

    def export_to_latex(self, content: str) -> str:
        """
        Convert markdown document to LaTeX format.

        Args:
            content: Markdown content

        Returns:
            LaTeX formatted content
        """
        # Escape special characters first, leaving math mode untouched.
        # Splitting on "$" makes every odd segment inline math; a "$$"
        # display block appears as an empty segment on each side of it.
        segments = content.split("$")
        for idx in range(0, len(segments), 2):
            is_display_math = (
                idx > 0
                and segments[idx - 1] == ""
                and idx < len(segments) - 1
                and segments[idx + 1] == ""
            )
            if is_display_math:
                continue
            rewritten = []
            for raw_line in segments[idx].split("\n"):
                if raw_line.strip().startswith("#"):
                    # Heading lines are converted later; leave them as-is.
                    rewritten.append(raw_line)
                else:
                    # Escape special chars except *, #, [ and ], which the
                    # emphasis/heading/citation passes consume afterwards.
                    rewritten.append(
                        raw_line.replace("&", r"\&")
                        .replace("%", r"\%")
                        .replace("_", r"\_")
                    )
            segments[idx] = "\n".join(rewritten)
        body = "$".join(segments)

        # Headings, then emphasis.
        for pattern, replacement in self.heading_patterns:
            body = pattern.sub(replacement, body)
        for pattern, replacement in self.emphasis_patterns:
            body = pattern.sub(replacement, body)

        # [N] citations become \cite{N}.
        body = self.citation_pattern.sub(r"\\cite{\1}", body)

        # Bullet lists become itemize environments.
        body = self._convert_lists(body)

        return (
            self._create_latex_header()
            + body
            + self._create_bibliography(content)
            + self._create_latex_footer()
        )

    def _create_latex_header(self) -> str:
        """Create LaTeX document header."""
        return r"""\documentclass[12pt]{article}
\usepackage[utf8]{inputenc}
\usepackage{hyperref}
\usepackage{cite}
\usepackage{url}

\title{Research Report}
\date{\today}

\begin{document}
\maketitle

"""

    def _create_latex_footer(self) -> str:
        """Create LaTeX document footer."""
        return "\n\\end{document}\n"

    def _escape_latex(self, text: str) -> str:
        """Escape special LaTeX characters in *text*."""
        # Backslash must be handled first, or it would re-escape the
        # replacements themselves.
        replacements = (
            ("\\", r"\textbackslash{}"),
            ("&", r"\&"),
            ("%", r"\%"),
            ("$", r"\$"),
            ("#", r"\#"),
            ("_", r"\_"),
            ("{", r"\{"),
            ("}", r"\}"),
            ("~", r"\textasciitilde{}"),
            ("^", r"\textasciicircum{}"),
        )
        for plain, escaped in replacements:
            text = text.replace(plain, escaped)
        return text

    def _convert_lists(self, content: str) -> str:
        """Convert markdown bullet lists to LaTeX itemize environments."""
        content = re.sub(
            r"^- (.+)$", r"\\item \1", content, flags=re.MULTILINE
        )

        output = []
        inside_itemize = False
        for line in content.split("\n"):
            if line.strip().startswith("\\item"):
                if not inside_itemize:
                    output.append("\\begin{itemize}")
                    inside_itemize = True
                output.append(line)
            else:
                # Blank lines do not end the list; any other text does.
                if inside_itemize and line.strip():
                    output.append("\\end{itemize}")
                    inside_itemize = False
                output.append(line)

        if inside_itemize:
            output.append("\\end{itemize}")

        return "\n".join(output)

    def _create_bibliography(self, content: str) -> str:
        """Extract sources and create LaTeX bibliography."""
        sources_start = find_sources_section(content)
        if sources_start == -1:
            return ""

        source_line = re.compile(
            r"^\[(\d+)\]\s*(.+?)(?:\n\s*URL:\s*(.+?))?$", re.MULTILINE
        )

        parts = ["\n\\begin{thebibliography}{99}\n"]
        for match in source_line.finditer(content[sources_start:]):
            num = match.group(1)
            escaped_title = self._escape_latex(match.group(2).strip())
            url = match.group(3).strip() if match.group(3) else ""
            if url:
                parts.append(
                    f"\\bibitem{{{num}}} {escaped_title}. \\url{{{url}}}\n"
                )
            else:
                parts.append(f"\\bibitem{{{num}}} {escaped_title}.\n")
        parts.append("\\end{thebibliography}\n")
        return "".join(parts)