Coverage for src / local_deep_research / text_optimization / citation_formatter.py: 93%

433 statements  

« prev     ^ index     » next       coverage.py v7.12.0, created at 2026-01-11 00:51 +0000

1"""Citation formatter for adding hyperlinks and alternative citation styles.""" 

2 

3import re 

4from enum import Enum 

5from typing import Dict, Tuple 

6from urllib.parse import urlparse 

7 

8 

class CitationMode(Enum):
    """Available citation formatting modes."""

    # Numeric citations like [1], each hyperlinked to its source URL.
    NUMBER_HYPERLINKS = "number_hyperlinks"
    # Domain-based citations like [arxiv.org], hyperlinked.
    DOMAIN_HYPERLINKS = "domain_hyperlinks"
    # Domain citations that gain a numeric suffix only when several
    # sources share a domain, e.g. [arxiv.org] or [arxiv.org-1].
    DOMAIN_ID_HYPERLINKS = "domain_id_hyperlinks"
    # Domain citations that always carry a numeric suffix, e.g. [arxiv.org-1].
    DOMAIN_ID_ALWAYS_HYPERLINKS = "domain_id_always_hyperlinks"
    # Plain numeric citations with no hyperlinking applied.
    NO_HYPERLINKS = "no_hyperlinks"

21 

22 

class CitationFormatter:
    """Formats citations in markdown documents with various styles.

    The formatter locates a trailing "Sources"/"References" section,
    parses each "[n] Title" (plus optional "URL: ..." line) entry, and
    rewrites in-text citations such as ``[1]``, ``[1, 2]`` and
    ``Source 1`` in the body according to the selected
    :class:`CitationMode`. The sources section itself is left untouched.
    """

    # Translation table for _to_superscript; non-digits pass through.
    _SUPERSCRIPTS = str.maketrans("0123456789", "⁰¹²³⁴⁵⁶⁷⁸⁹")

    def __init__(self, mode: CitationMode = CitationMode.NUMBER_HYPERLINKS):
        self.mode = mode
        # Negative lookbehind/lookahead avoid re-matching citations that
        # were already rewritten to the [[n]](url) form.
        self.citation_pattern = re.compile(r"(?<!\[)\[(\d+)\](?!\])")
        # Grouped citations like [1, 2, 3].
        self.comma_citation_pattern = re.compile(r"\[(\d+(?:,\s*\d+)+)\]")
        # Prose references like "Source 3" / "source 3".
        self.source_word_pattern = re.compile(r"\b[Ss]ource\s+(\d+)\b")
        # One sources-section entry: "[n] Title" or "[n, m] Title",
        # optionally followed by an indented "URL: ..." line.
        self.sources_pattern = re.compile(
            r"^\[(\d+(?:,\s*\d+)*)\]\s*(.+?)(?:\n\s*URL:\s*(.+?))?$",
            re.MULTILINE,
        )

    def _create_source_word_replacer(self, formatter_func):
        """Create a replacement function for 'Source X' patterns.

        Args:
            formatter_func: A function that takes citation_num and returns formatted text

        Returns:
            A replacement function for use with regex sub
        """

        def replace_source_word(match):
            return formatter_func(match.group(1))

        return replace_source_word

    def _create_citation_formatter(self, sources_dict, format_pattern):
        """Create a formatter function for citations.

        Args:
            sources_dict: Dictionary mapping citation numbers to data
            format_pattern: A callable that takes (citation_num, data) and returns formatted string

        Returns:
            A function that formats known citations or falls back to "[n]"
        """

        def formatter(citation_num):
            if citation_num in sources_dict:
                return format_pattern(citation_num, sources_dict[citation_num])
            return f"[{citation_num}]"

        return formatter

    def _sub_comma_citations(self, content: str, format_num) -> str:
        """Rewrite grouped citations like [1, 2, 3].

        Args:
            content: Text to rewrite.
            format_num: Callable mapping a single citation number string
                to its formatted replacement (including the fallback).

        Returns:
            Content with each grouped citation replaced by the
            concatenation of its per-number replacements.
        """

        def replace(match):
            nums = [num.strip() for num in match.group(1).split(",")]
            return "".join(format_num(num) for num in nums)

        return self.comma_citation_pattern.sub(replace, content)

    def _sub_single_citations(self, content: str, lookup, formatter) -> str:
        """Rewrite individual [n] citations and "Source n" mentions.

        [n] citations whose number is missing from *lookup* are left
        untouched; "Source n" mentions always go through *formatter*
        (which falls back to "[n]" for unknown numbers).
        """

        def replace(match):
            num = match.group(1)
            return formatter(num) if num in lookup else match.group(0)

        content = self.citation_pattern.sub(replace, content)
        return self.source_word_pattern.sub(
            self._create_source_word_replacer(formatter), content
        )

    def format_document(self, content: str) -> str:
        """
        Format citations in the document according to the selected mode.

        Args:
            content: The markdown content to format

        Returns:
            Formatted markdown content (unchanged when the mode is
            NO_HYPERLINKS or no sources section is found)
        """
        if self.mode == CitationMode.NO_HYPERLINKS:
            return content

        # Split body from the sources/references section.
        sources_start = self._find_sources_section(content)
        if sources_start == -1:
            return content

        document_content = content[:sources_start]
        sources_content = content[sources_start:]
        sources = self._parse_sources(sources_content)

        handlers = {
            CitationMode.NUMBER_HYPERLINKS: self._format_number_hyperlinks,
            CitationMode.DOMAIN_HYPERLINKS: self._format_domain_hyperlinks,
            CitationMode.DOMAIN_ID_HYPERLINKS: self._format_domain_id_hyperlinks,
            CitationMode.DOMAIN_ID_ALWAYS_HYPERLINKS: self._format_domain_id_always_hyperlinks,
        }
        handler = handlers.get(self.mode)
        formatted_content = (
            handler(document_content, sources) if handler else document_content
        )

        # Rebuild document; the sources section is appended unchanged.
        return formatted_content + sources_content

    def _find_sources_section(self, content: str) -> int:
        """Return the start index of the sources/references section, or -1."""
        patterns = [
            r"^#{1,3}\s*(?:Sources|References|Bibliography|Citations)",
            r"^(?:Sources|References|Bibliography|Citations):?\s*$",
        ]

        for pattern in patterns:
            match = re.search(pattern, content, re.MULTILINE | re.IGNORECASE)
            if match:
                return match.start()

        return -1

    def _parse_sources(
        self, sources_content: str
    ) -> Dict[str, Tuple[str, str]]:
        """
        Parse sources section to extract citation numbers, titles, and URLs.

        Entries listing several numbers, e.g. "[36, 3] Title", produce one
        mapping entry per number.

        Returns:
            Dictionary mapping citation number to (title, url) tuple
        """
        sources = {}
        for match in self.sources_pattern.finditer(sources_content):
            title = match.group(2).strip()
            url = match.group(3).strip() if match.group(3) else ""
            for num in match.group(1).split(","):
                sources[num.strip()] = (title, url)
        return sources

    def _url_sources(self, sources):
        """Filter *sources* down to the entries that carry a URL."""
        return {
            num: (title, url) for num, (title, url) in sources.items() if url
        }

    def _format_number_hyperlinks(
        self, content: str, sources: Dict[str, Tuple[str, str]]
    ) -> str:
        """Replace [1] with a hyperlinked version where only the number is linked."""
        url_sources = self._url_sources(sources)

        def link_number(num):
            if num in url_sources:
                return f"[[{num}]]({url_sources[num][1]})"
            return f"[{num}]"

        content = self._sub_comma_citations(content, link_number)
        formatter = self._create_citation_formatter(
            url_sources, lambda num, data: f"[[{num}]]({data[1]})"
        )
        return self._sub_single_citations(content, url_sources, formatter)

    def _format_domain_hyperlinks(
        self, content: str, sources: Dict[str, Tuple[str, str]]
    ) -> str:
        """Replace [1] with a hyperlinked [domain.com] version."""
        url_sources = self._url_sources(sources)

        def link_domain(num):
            if num in url_sources:
                url = url_sources[num][1]
                return f"[[{self._extract_domain(url)}]]({url})"
            return f"[{num}]"

        content = self._sub_comma_citations(content, link_domain)
        formatter = self._create_citation_formatter(
            url_sources,
            lambda num, data: f"[[{self._extract_domain(data[1])}]]({data[1]})",
        )
        return self._sub_single_citations(content, url_sources, formatter)

    def _build_domain_id_map(
        self, sources: Dict[str, Tuple[str, str]], always_id: bool
    ) -> Dict[str, Tuple[str, str]]:
        """Map citation numbers to (domain-or-domain-id, url).

        Args:
            sources: Parsed sources mapping.
            always_id: When True every citation gets a "-N" suffix;
                otherwise only domains with multiple citations do.
        """
        domain_citations = {}
        for num, (_title, url) in sources.items():
            if url:
                domain = self._extract_domain(url)
                domain_citations.setdefault(domain, []).append((num, url))

        mapping = {}
        for domain, citations in domain_citations.items():
            if always_id or len(citations) > 1:
                for idx, (num, url) in enumerate(citations, 1):
                    mapping[num] = (f"{domain}-{idx}", url)
            else:
                # Single citation from this domain - no ID needed.
                num, url = citations[0]
                mapping[num] = (domain, url)
        return mapping

    def _format_with_domain_ids(self, content, mapping):
        """Shared tail of the two domain-id formatting modes."""

        def link(num):
            if num in mapping:
                domain_id, url = mapping[num]
                return f"[[{domain_id}]]({url})"
            return f"[{num}]"

        content = self._sub_comma_citations(content, link)
        formatter = self._create_citation_formatter(
            mapping, lambda num, data: f"[[{data[0]}]]({data[1]})"
        )
        return self._sub_single_citations(content, mapping, formatter)

    def _format_domain_id_hyperlinks(
        self, content: str, sources: Dict[str, Tuple[str, str]]
    ) -> str:
        """Replace [1] with [domain.com] or [domain.com-1] (IDs only on collision)."""
        mapping = self._build_domain_id_map(sources, always_id=False)
        return self._format_with_domain_ids(content, mapping)

    def _format_domain_id_always_hyperlinks(
        self, content: str, sources: Dict[str, Tuple[str, str]]
    ) -> str:
        """Replace [1] with [domain.com-1], always carrying an ID."""
        mapping = self._build_domain_id_map(sources, always_id=True)
        return self._format_with_domain_ids(content, mapping)

    def _to_superscript(self, text: str) -> str:
        """Convert digits in *text* to Unicode superscript characters."""
        return text.translate(self._SUPERSCRIPTS)

    def _extract_domain(self, url: str) -> str:
        """Extract a short display domain from a URL.

        Falls back to "source" when the URL cannot be parsed at all.
        """
        try:
            domain = urlparse(url).netloc
            if domain.startswith("www."):
                domain = domain[4:]

            # Keep well-known domains as-is (substring match preserves
            # e.g. "m.youtube.com" -> "youtube.com").
            known_domains = {
                "arxiv.org": "arxiv.org",
                "github.com": "github.com",
                "reddit.com": "reddit.com",
                "youtube.com": "youtube.com",
                "pypi.org": "pypi.org",
                "milvus.io": "milvus.io",
                "medium.com": "medium.com",
            }
            for known, display in known_domains.items():
                if known in domain:
                    return display

            # For other domains, keep only the registrable part.
            parts = domain.split(".")
            if len(parts) >= 2:
                return ".".join(parts[-2:])
            return domain
        except Exception:  # was a bare except; don't mask SystemExit etc.
            return "source"

455 

456 

457class QuartoExporter: 

458 """Export markdown documents to Quarto (.qmd) format.""" 

459 

460 def __init__(self): 

461 self.citation_pattern = re.compile(r"(?<!\[)\[(\d+)\](?!\])") 

462 self.comma_citation_pattern = re.compile(r"\[(\d+(?:,\s*\d+)+)\]") 

463 

464 def export_to_quarto(self, content: str, title: str = None) -> str: 

465 """ 

466 Convert markdown document to Quarto format. 

467 

468 Args: 

469 content: Markdown content 

470 title: Document title (if None, will extract from content) 

471 

472 Returns: 

473 Quarto formatted content 

474 """ 

475 # Extract title from markdown if not provided 

476 if not title: 

477 title_match = re.search(r"^#\s+(.+)$", content, re.MULTILINE) 

478 title = title_match.group(1) if title_match else "Research Report" 

479 

480 # Create Quarto YAML header 

481 from datetime import datetime, UTC 

482 

483 current_date = datetime.now(UTC).strftime("%Y-%m-%d") 

484 yaml_header = f"""--- 

485title: "{title}" 

486author: "Local Deep Research" 

487date: "{current_date}" 

488format: 

489 html: 

490 toc: true 

491 toc-depth: 3 

492 number-sections: true 

493 pdf: 

494 toc: true 

495 number-sections: true 

496 colorlinks: true 

497bibliography: references.bib 

498csl: apa.csl 

499--- 

500 

501""" 

502 

503 # Process content 

504 processed_content = content 

505 

506 # First handle comma-separated citations like [1, 2, 3] 

507 def replace_comma_citations(match): 

508 citation_nums = match.group(1) 

509 # Split by comma and strip whitespace 

510 nums = [num.strip() for num in citation_nums.split(",")] 

511 refs = [f"@ref{num}" for num in nums] 

512 return f"[{', '.join(refs)}]" 

513 

514 processed_content = self.comma_citation_pattern.sub( 

515 replace_comma_citations, processed_content 

516 ) 

517 

518 # Then convert individual citations to Quarto format [@citation] 

519 def replace_citation(match): 

520 citation_num = match.group(1) 

521 return f"[@ref{citation_num}]" 

522 

523 processed_content = self.citation_pattern.sub( 

524 replace_citation, processed_content 

525 ) 

526 

527 # Generate bibliography file content 

528 bib_content = self._generate_bibliography(content) 

529 

530 # Add note about bibliography file 

531 bibliography_note = ( 

532 "\n\n::: {.callout-note}\n## Bibliography File Required\n\nThis document requires a `references.bib` file in the same directory with the following content:\n\n```bibtex\n" 

533 + bib_content 

534 + "\n```\n:::\n" 

535 ) 

536 

537 return yaml_header + processed_content + bibliography_note 

538 

539 def _generate_bibliography(self, content: str) -> str: 

540 """Generate BibTeX bibliography from sources.""" 

541 sources_pattern = re.compile( 

542 r"^\[(\d+)\]\s*(.+?)(?:\n\s*URL:\s*(.+?))?$", re.MULTILINE 

543 ) 

544 

545 bibliography = "" 

546 matches = list(sources_pattern.finditer(content)) 

547 

548 for match in matches: 

549 citation_num = match.group(1) 

550 title = match.group(2).strip() 

551 url = match.group(3).strip() if match.group(3) else "" 

552 

553 # Generate BibTeX entry 

554 bib_entry = f"@misc{{ref{citation_num},\n" 

555 bib_entry += f' title = "{{{title}}}",\n' 

556 if url: 

557 bib_entry += f" url = {{{url}}},\n" 

558 bib_entry += f' howpublished = "\\url{{{url}}}",\n' 

559 bib_entry += f" year = {{{2024}}},\n" 

560 bib_entry += ' note = "Accessed: \\today"\n' 

561 bib_entry += "}\n" 

562 

563 bibliography += bib_entry + "\n" 

564 

565 return bibliography.strip() 

566 

567 

568class RISExporter: 

569 """Export references to RIS format for reference managers like Zotero.""" 

570 

571 def __init__(self): 

572 self.sources_pattern = re.compile( 

573 r"^\[(\d+(?:,\s*\d+)*)\]\s*(.+?)(?:\n\s*URL:\s*(.+?))?$", 

574 re.MULTILINE, 

575 ) 

576 

577 def export_to_ris(self, content: str) -> str: 

578 """ 

579 Extract references from markdown and convert to RIS format. 

580 

581 Args: 

582 content: Markdown content with sources 

583 

584 Returns: 

585 RIS formatted references 

586 """ 

587 # Find sources section 

588 sources_start = content.find("## Sources") 

589 if sources_start == -1: 

590 sources_start = content.find("## References") 

591 if sources_start == -1: 

592 sources_start = content.find("### Sources") 

593 if sources_start == -1: 

594 sources_start = content.find("### SOURCES") 

595 

596 if sources_start == -1: 

597 return "" 

598 

599 # Find the end of the first sources section (before any other major section) 

600 sources_content = content[sources_start:] 

601 

602 # Look for the next major section to avoid duplicates 

603 next_section_markers = [ 

604 "\n## ALL SOURCES", 

605 "\n### ALL SOURCES", 

606 "\n## Research Metrics", 

607 "\n### Research Metrics", 

608 "\n## SEARCH QUESTIONS", 

609 "\n### SEARCH QUESTIONS", 

610 "\n## DETAILED FINDINGS", 

611 "\n### DETAILED FINDINGS", 

612 "\n---", # Horizontal rule often separates sections 

613 ] 

614 

615 sources_end = len(sources_content) 

616 for marker in next_section_markers: 

617 pos = sources_content.find(marker) 

618 if pos != -1 and pos < sources_end: 618 ↛ 619line 618 didn't jump to line 619 because the condition on line 618 was never true

619 sources_end = pos 

620 

621 sources_content = sources_content[:sources_end] 

622 

623 # Parse sources and generate RIS entries 

624 ris_entries = [] 

625 seen_refs = set() # Track which references we've already processed 

626 

627 # Split sources into individual entries 

628 import re 

629 

630 # Pattern to match each source entry 

631 source_entry_pattern = re.compile( 

632 r"^\[(\d+)\]\s*(.+?)(?=^\[\d+\]|\Z)", re.MULTILINE | re.DOTALL 

633 ) 

634 

635 for match in source_entry_pattern.finditer(sources_content): 

636 citation_num = match.group(1) 

637 entry_text = match.group(2).strip() 

638 

639 # Extract the title (first line) 

640 lines = entry_text.split("\n") 

641 title = lines[0].strip() 

642 

643 # Extract URL, DOI, and other metadata from subsequent lines 

644 url = "" 

645 metadata = {} 

646 for line in lines[1:]: 

647 line = line.strip() 

648 if line.startswith("URL:"): 

649 url = line[4:].strip() 

650 elif line.startswith("DOI:"): 

651 metadata["doi"] = line[4:].strip() 

652 elif line.startswith("Published in"): 

653 metadata["journal"] = line[12:].strip() 

654 # Add more metadata parsing as needed 

655 elif line: 655 ↛ 646line 655 didn't jump to line 646 because the condition on line 655 was always true

656 # Store other lines as additional metadata 

657 if "additional" not in metadata: 657 ↛ 659line 657 didn't jump to line 659 because the condition on line 657 was always true

658 metadata["additional"] = [] 

659 metadata["additional"].append(line) 

660 

661 # Combine title with additional metadata lines for full context 

662 full_text = entry_text 

663 

664 # Create a unique key to avoid duplicates 

665 ref_key = (citation_num, title, url) 

666 if ref_key not in seen_refs: 666 ↛ 635line 666 didn't jump to line 635 because the condition on line 666 was always true

667 seen_refs.add(ref_key) 

668 # Create RIS entry with full text for metadata extraction 

669 ris_entry = self._create_ris_entry( 

670 citation_num, full_text, url, metadata 

671 ) 

672 ris_entries.append(ris_entry) 

673 

674 return "\n".join(ris_entries) 

675 

676 def _create_ris_entry( 

677 self, ref_id: str, full_text: str, url: str = "", metadata: dict = None 

678 ) -> str: 

679 """Create a single RIS entry.""" 

680 lines = [] 

681 

682 # Parse metadata from full text 

683 import re 

684 

685 if metadata is None: 685 ↛ 686line 685 didn't jump to line 686 because the condition on line 685 was never true

686 metadata = {} 

687 

688 # Extract title from first line 

689 lines = full_text.split("\n") 

690 title = lines[0].strip() 

691 

692 # Extract year from full text (looks for 4-digit year) 

693 year_match = re.search(r"\b(19\d{2}|20\d{2})\b", full_text) 

694 year = year_match.group(1) if year_match else None 

695 

696 # Extract authors if present (looks for "by Author1, Author2") 

697 authors_match = re.search( 

698 r"\bby\s+([^.\n]+?)(?:\.|\n|$)", full_text, re.IGNORECASE 

699 ) 

700 authors = [] 

701 if authors_match: 

702 authors_text = authors_match.group(1) 

703 # Split by 'and' or ',' 

704 author_parts = re.split(r"\s*(?:,|\sand\s|&)\s*", authors_text) 

705 authors = [a.strip() for a in author_parts if a.strip()] 

706 

707 # Extract DOI from metadata or text 

708 doi = metadata.get("doi") 

709 if not doi: 

710 doi_match = re.search( 

711 r"DOI:\s*([^\s\n]+)", full_text, re.IGNORECASE 

712 ) 

713 doi = doi_match.group(1) if doi_match else None 

714 

715 # Clean title - remove author and metadata info for cleaner title 

716 clean_title = title 

717 if authors_match and authors_match.start() < len(title): 

718 clean_title = ( 

719 title[: authors_match.start()] + title[authors_match.end() :] 

720 if authors_match.end() < len(title) 

721 else title[: authors_match.start()] 

722 ) 

723 clean_title = re.sub( 

724 r"\s*DOI:\s*[^\s]+", "", clean_title, flags=re.IGNORECASE 

725 ) 

726 clean_title = re.sub( 

727 r"\s*Published in.*", "", clean_title, flags=re.IGNORECASE 

728 ) 

729 clean_title = re.sub( 

730 r"\s*Volume.*", "", clean_title, flags=re.IGNORECASE 

731 ) 

732 clean_title = re.sub( 

733 r"\s*Pages.*", "", clean_title, flags=re.IGNORECASE 

734 ) 

735 clean_title = clean_title.strip() 

736 

737 # TY - Type of reference (ELEC for electronic source/website) 

738 lines.append("TY - ELEC") 

739 

740 # ID - Reference ID 

741 lines.append(f"ID - ref{ref_id}") 

742 

743 # TI - Title 

744 lines.append(f"TI - {clean_title if clean_title else title}") 

745 

746 # AU - Authors 

747 for author in authors: 

748 lines.append(f"AU - {author}") 

749 

750 # DO - DOI 

751 if doi: 

752 lines.append(f"DO - {doi}") 

753 

754 # PY - Publication year (if found in title) 

755 if year: 

756 lines.append(f"PY - {year}") 

757 

758 # UR - URL 

759 if url: 

760 lines.append(f"UR - {url}") 

761 

762 # Try to extract domain as publisher 

763 try: 

764 from urllib.parse import urlparse 

765 

766 parsed = urlparse(url) 

767 domain = parsed.netloc 

768 if domain.startswith("www."): 

769 domain = domain[4:] 

770 # Extract readable publisher name from domain 

771 if domain == "github.com" or domain.endswith(".github.com"): 

772 lines.append("PB - GitHub") 

773 elif domain == "arxiv.org" or domain.endswith(".arxiv.org"): 

774 lines.append("PB - arXiv") 

775 elif domain == "reddit.com" or domain.endswith(".reddit.com"): 

776 lines.append("PB - Reddit") 

777 elif ( 777 ↛ 782line 777 didn't jump to line 782 because the condition on line 777 was never true

778 domain == "youtube.com" 

779 or domain == "m.youtube.com" 

780 or domain.endswith(".youtube.com") 

781 ): 

782 lines.append("PB - YouTube") 

783 elif domain == "medium.com" or domain.endswith(".medium.com"): 783 ↛ 784line 783 didn't jump to line 784 because the condition on line 783 was never true

784 lines.append("PB - Medium") 

785 elif domain == "pypi.org" or domain.endswith(".pypi.org"): 785 ↛ 786line 785 didn't jump to line 786 because the condition on line 785 was never true

786 lines.append("PB - Python Package Index (PyPI)") 

787 else: 

788 # Use domain as publisher 

789 lines.append(f"PB - {domain}") 

790 except: 

791 pass 

792 

793 # Y1 - Year accessed (current year) 

794 from datetime import datetime, UTC 

795 

796 current_year = datetime.now(UTC).year 

797 lines.append(f"Y1 - {current_year}") 

798 

799 # DA - Date accessed 

800 current_date = datetime.now(UTC).strftime("%Y/%m/%d") 

801 lines.append(f"DA - {current_date}") 

802 

803 # LA - Language 

804 lines.append("LA - en") 

805 

806 # ER - End of reference 

807 lines.append("ER - ") 

808 

809 return "\n".join(lines) 

810 

811 

class LaTeXExporter:
    """Export markdown documents to LaTeX format."""

    # Single-pass escape table. The backslash MUST be handled in the same
    # pass as { and }: the old sequential str.replace chain re-escaped the
    # braces introduced by "\textbackslash{}" into "\{\}".
    _SPECIALS = {
        "\\": r"\textbackslash{}",
        "&": r"\&",
        "%": r"\%",
        "$": r"\$",
        "#": r"\#",
        "_": r"\_",
        "{": r"\{",
        "}": r"\}",
        "~": r"\textasciitilde{}",
        "^": r"\textasciicircum{}",
    }

    def __init__(self):
        self.citation_pattern = re.compile(r"\[(\d+)\]")
        self.heading_patterns = [
            (re.compile(r"^# (.+)$", re.MULTILINE), r"\\section{\1}"),
            (re.compile(r"^## (.+)$", re.MULTILINE), r"\\subsection{\1}"),
            (re.compile(r"^### (.+)$", re.MULTILINE), r"\\subsubsection{\1}"),
        ]
        self.emphasis_patterns = [
            (re.compile(r"\*\*(.+?)\*\*"), r"\\textbf{\1}"),
            (re.compile(r"\*(.+?)\*"), r"\\textit{\1}"),
            (re.compile(r"`(.+?)`"), r"\\texttt{\1}"),
        ]

    def export_to_latex(self, content: str) -> str:
        """
        Convert markdown document to LaTeX format.

        Args:
            content: Markdown content

        Returns:
            LaTeX formatted content (preamble + body + bibliography + footer)
        """
        latex_content = self._create_latex_header()
        body_content = content

        # Escape special LaTeX characters outside math mode. Splitting on
        # "$" leaves math sections at odd indices untouched.
        parts = body_content.split("$")
        for i in range(len(parts)):
            # Even indices are outside math mode.
            if i % 2 == 0:
                # Skip segments sandwiched between empty parts: those are
                # inside a $$...$$ display-math block.
                if not (
                    i > 0
                    and parts[i - 1] == ""
                    and i < len(parts) - 1
                    and parts[i + 1] == ""
                ):
                    lines = parts[i].split("\n")
                    for j, line in enumerate(lines):
                        # Heading lines keep their '#'; emphasis markers and
                        # citations are handled by their own patterns later,
                        # so only &, % and _ are escaped here.
                        if not line.strip().startswith("#"):
                            temp_line = line
                            temp_line = temp_line.replace("&", r"\&")
                            temp_line = temp_line.replace("%", r"\%")
                            temp_line = temp_line.replace("_", r"\_")
                            lines[j] = temp_line
                    parts[i] = "\n".join(lines)
        body_content = "$".join(parts)

        # Convert headings, then emphasis, then citations.
        for pattern, replacement in self.heading_patterns:
            body_content = pattern.sub(replacement, body_content)
        for pattern, replacement in self.emphasis_patterns:
            body_content = pattern.sub(replacement, body_content)
        body_content = self.citation_pattern.sub(r"\\cite{\1}", body_content)

        # Convert bullet lists.
        body_content = self._convert_lists(body_content)

        latex_content += body_content
        latex_content += self._create_bibliography(content)
        latex_content += self._create_latex_footer()
        return latex_content

    def _create_latex_header(self) -> str:
        """Create LaTeX document header."""
        return r"""\documentclass[12pt]{article}
\usepackage[utf8]{inputenc}
\usepackage{hyperref}
\usepackage{cite}
\usepackage{url}

\title{Research Report}
\date{\today}

\begin{document}
\maketitle

"""

    def _create_latex_footer(self) -> str:
        """Create LaTeX document footer."""
        return "\n\\end{document}\n"

    def _escape_latex(self, text: str) -> str:
        """Escape special LaTeX characters in text.

        Done in a single pass per character so that replacement text
        (e.g. the braces in ``\\textbackslash{}``) is never re-escaped.
        """
        return "".join(self._SPECIALS.get(ch, ch) for ch in text)

    def _convert_lists(self, content: str) -> str:
        """Convert markdown bullet lists to LaTeX itemize environments."""
        # Turn "- item" lines into \item lines first.
        content = re.sub(r"^- (.+)$", r"\\item \1", content, flags=re.MULTILINE)

        # Wrap consecutive \item runs in an itemize environment; blank
        # lines inside a run do not terminate it.
        result = []
        in_list = False
        for line in content.split("\n"):
            if line.strip().startswith("\\item"):
                if not in_list:
                    result.append("\\begin{itemize}")
                    in_list = True
                result.append(line)
            else:
                if in_list and line.strip():
                    result.append("\\end{itemize}")
                    in_list = False
                result.append(line)
        if in_list:
            result.append("\\end{itemize}")

        return "\n".join(result)

    def _create_bibliography(self, content: str) -> str:
        """Extract sources and create a LaTeX thebibliography section."""
        sources_start = content.find("## Sources")
        if sources_start == -1:
            sources_start = content.find("## References")
        if sources_start == -1:
            return ""

        sources_content = content[sources_start:]
        pattern = re.compile(
            r"^\[(\d+)\]\s*(.+?)(?:\n\s*URL:\s*(.+?))?$", re.MULTILINE
        )

        bibliography = "\n\\begin{thebibliography}{99}\n"
        for match in pattern.finditer(sources_content):
            citation_num = match.group(1)
            title = match.group(2).strip()
            url = match.group(3).strip() if match.group(3) else ""

            # Escape special LaTeX characters in the title.
            escaped_title = self._escape_latex(title)
            if url:
                bibliography += f"\\bibitem{{{citation_num}}} {escaped_title}. \\url{{{url}}}\n"
            else:
                bibliography += (
                    f"\\bibitem{{{citation_num}}} {escaped_title}.\n"
                )
        bibliography += "\\end{thebibliography}\n"
        return bibliography