Coverage for src/local_deep_research/text_optimization/citation

1"""Citation formatter for adding hyperlinks and alternative citation styles."""

3import re

4from enum import Enum

5from typing import Dict, Tuple

6from urllib.parse import urlparse

# Patterns are tried in order: markdown-heading form first, bare-label form second.
_SOURCES_SECTION_PATTERNS = [
    # A level 1-3 markdown heading such as "## Sources" or "# references".
    re.compile(
        r"^#{1,3}\s*(?:Sources|References|Bibliography|Citations)",
        re.MULTILINE | re.IGNORECASE,
    ),
    # A label alone on its line such as "Sources:" or "Bibliography".
    re.compile(
        r"^(?:Sources|References|Bibliography|Citations):?\s*$",
        re.MULTILINE | re.IGNORECASE,
    ),
]


def find_sources_section(content: str) -> int:
    """Locate the sources/references section inside *content*.

    The patterns in ``_SOURCES_SECTION_PATTERNS`` are consulted in list
    order; the first pattern that matches anywhere wins, and the offset of
    its first match is returned.

    Args:
        content: Markdown text to scan.

    Returns:
        Character offset where the section starts, or -1 when no pattern
        matches.
    """
    candidates = (
        pattern.search(content) for pattern in _SOURCES_SECTION_PATTERNS
    )
    first_hit = next((hit for hit in candidates if hit is not None), None)
    return -1 if first_hit is None else first_hit.start()

class CitationMode(Enum):
    """Citation rendering styles selectable on the formatter."""

    # [1] with hyperlinks
    NUMBER_HYPERLINKS = "number_hyperlinks"
    # [arxiv.org] with hyperlinks
    DOMAIN_HYPERLINKS = "domain_hyperlinks"
    # [arxiv.org] or [arxiv.org-1] with smart IDs
    DOMAIN_ID_HYPERLINKS = "domain_id_hyperlinks"
    # [arxiv.org-1] always with IDs
    DOMAIN_ID_ALWAYS_HYPERLINKS = "domain_id_always_hyperlinks"
    # [1] without hyperlinks
    NO_HYPERLINKS = "no_hyperlinks"

class CitationFormatter:
    """Formats citations in markdown documents with various styles.

    The document is split at its sources/references section. Numeric
    markers in the text before that section (``[1]``, ``【1】``,
    ``[1, 2]`` and "Source 1") are rewritten according to ``mode``; the
    sources section itself is appended back unchanged.
    """

    # Domains that are always shown by this exact name, including for their
    # sub-domains (e.g. "gist.github.com" is displayed as "github.com").
    _KNOWN_DOMAINS = (
        "arxiv.org",
        "github.com",
        "reddit.com",
        "youtube.com",
        "pypi.org",
        "milvus.io",
        "medium.com",
    )

    def __init__(self, mode: CitationMode = CitationMode.NUMBER_HYPERLINKS):
        """Store the formatting mode and compile the citation patterns.

        Args:
            mode: Citation style applied by ``format_document``.
        """
        self.mode = mode
        # Use negative lookbehind and lookahead to avoid matching already
        # formatted citations. Also match Unicode lenticular brackets 【】
        # (U+3010 and U+3011) that LLMs sometimes generate.
        self.citation_pattern = re.compile(
            r"(?<![\[【])[\[【](\d+)[\]】](?![\]】])"
        )
        self.comma_citation_pattern = re.compile(
            r"[\[【](\d+(?:,\s*\d+)+)[\]】]"
        )
        # Also match "Source X" or "source X" patterns
        self.source_word_pattern = re.compile(r"\b[Ss]ource\s+(\d+)\b")
        self.sources_pattern = re.compile(
            r"^\[(\d+(?:,\s*\d+)*)\]\s*(.+?)(?:\n\s*URL:\s*(.+?))?$",
            re.MULTILINE,
        )

    def _create_source_word_replacer(self, formatter_func):
        """Create a replacement function for 'Source X' patterns.

        Args:
            formatter_func: A function that takes citation_num and returns
                formatted text

        Returns:
            A replacement function for use with regex sub
        """

        def replace_source_word(match):
            return formatter_func(match.group(1))

        return replace_source_word

    def _create_citation_formatter(self, sources_dict, format_pattern):
        """Create a formatter function for citations.

        Args:
            sources_dict: Dictionary mapping citation numbers to data
            format_pattern: A callable that takes (citation_num, data) and
                returns formatted string

        Returns:
            A function that formats a citation number, falling back to the
            plain ``[n]`` form when the number is not in *sources_dict*.
        """

        def formatter(citation_num):
            if citation_num in sources_dict:
                return format_pattern(citation_num, sources_dict[citation_num])
            return f"[{citation_num}]"

        return formatter

    def _replace_comma_citations(self, content, lookup, format_one):
        """Replace comma-separated citations like [1, 2, 3].

        Each number is formatted on its own and the results are
        concatenated without separators; unknown numbers become ``[n]``.

        Args:
            content: Text to process
            lookup: Dict mapping citation number (str) to data
            format_one: ``(num, data) -> str`` callback that formats a
                single citation
        """

        def _replacer(match):
            nums = (n.strip() for n in match.group(1).split(","))
            return "".join(
                format_one(num, lookup[num]) if num in lookup else f"[{num}]"
                for num in nums
            )

        return self.comma_citation_pattern.sub(_replacer, content)

    def _apply_links(self, content, lookup, format_one):
        """Run the three replacement passes shared by every hyperlink mode.

        Order matters: grouped citations first, then single bracketed
        citations, then "Source X" phrases.

        Args:
            content: Text to process
            lookup: Dict mapping citation number (str) to data
            format_one: ``(num, data) -> str`` callback for one citation
        """
        content = self._replace_comma_citations(content, lookup, format_one)
        formatter = self._create_citation_formatter(lookup, format_one)

        def replace_citation(match):
            # Unknown numbers keep their original text (and bracket style).
            num = match.group(1)
            return formatter(num) if num in lookup else match.group(0)

        content = self.citation_pattern.sub(replace_citation, content)

        # "Source X" with an unknown X becomes "[X]" via the formatter
        # fallback.
        return self.source_word_pattern.sub(
            self._create_source_word_replacer(formatter), content
        )

    def _group_by_domain(self, sources: Dict[str, Tuple[str, str]]):
        """Group ``(citation_num, url)`` pairs by display domain.

        Sources without a URL are skipped; insertion order is preserved.
        """
        grouped = {}
        for citation_num, (_title, url) in sources.items():
            if url:
                domain = self._extract_domain(url)
                grouped.setdefault(domain, []).append((citation_num, url))
        return grouped

    @staticmethod
    def _format_labelled_link(citation_num, data):
        """Format ``(label, url)`` data as ``[[label]](url)``."""
        label, url = data
        return f"[[{label}]]({url})"

    def format_document(self, content: str) -> str:
        """
        Format citations in the document according to the selected mode.

        Args:
            content: The markdown content to format

        Returns:
            Formatted markdown content. The input is returned unchanged
            when the mode is ``NO_HYPERLINKS`` or no sources section is
            found.
        """
        if self.mode == CitationMode.NO_HYPERLINKS:
            return content

        # Extract sources section
        sources_start = self._find_sources_section(content)
        if sources_start == -1:
            return content

        document_content = content[:sources_start]
        sources_content = content[sources_start:]

        # Parse sources
        sources = self._parse_sources(sources_content)

        # Format citations in document
        if self.mode == CitationMode.NUMBER_HYPERLINKS:
            formatted_content = self._format_number_hyperlinks(
                document_content, sources
            )
        elif self.mode == CitationMode.DOMAIN_HYPERLINKS:
            formatted_content = self._format_domain_hyperlinks(
                document_content, sources
            )
        elif self.mode == CitationMode.DOMAIN_ID_HYPERLINKS:
            formatted_content = self._format_domain_id_hyperlinks(
                document_content, sources
            )
        elif self.mode == CitationMode.DOMAIN_ID_ALWAYS_HYPERLINKS:
            formatted_content = self._format_domain_id_always_hyperlinks(
                document_content, sources
            )
        else:
            formatted_content = document_content

        # Rebuild document
        return formatted_content + sources_content

    def _find_sources_section(self, content: str) -> int:
        """Find the start of the sources/references section."""
        return find_sources_section(content)

    def _parse_sources(
        self, sources_content: str
    ) -> Dict[str, Tuple[str, str]]:
        """
        Parse sources section to extract citation numbers, titles, and URLs.

        An entry labelled with several numbers (``[36, 3] Title``) is
        registered once per number.

        Returns:
            Dictionary mapping citation number to (title, url) tuple; url
            is "" when the entry has no ``URL:`` line.
        """
        sources = {}
        for match in self.sources_pattern.finditer(sources_content):
            title = match.group(2).strip()
            url = match.group(3).strip() if match.group(3) else ""
            for num in match.group(1).split(","):
                sources[num.strip()] = (title, url)
        return sources

    def _format_number_hyperlinks(
        self, content: str, sources: Dict[str, Tuple[str, str]]
    ) -> str:
        """Replace [1] with hyperlinked version where only the number is linked."""
        # Only sources that have URLs can be linked.
        url_sources = {
            num: (title, url) for num, (title, url) in sources.items() if url
        }

        def format_number_link(citation_num, data):
            _, url = data
            return f"[[{citation_num}]]({url})"

        return self._apply_links(content, url_sources, format_number_link)

    def _format_domain_hyperlinks(
        self, content: str, sources: Dict[str, Tuple[str, str]]
    ) -> str:
        """Replace [1] with [domain.com] hyperlinked version."""
        # Only sources that have URLs can be linked.
        url_sources = {
            num: (title, url) for num, (title, url) in sources.items() if url
        }

        def format_domain_link(citation_num, data):
            _, url = data
            return f"[[{self._extract_domain(url)}]]({url})"

        return self._apply_links(content, url_sources, format_domain_link)

    def _format_domain_id_hyperlinks(
        self, content: str, sources: Dict[str, Tuple[str, str]]
    ) -> str:
        """Replace [1] with [domain.com] or [domain.com-1] hyperlinked version.

        The hyphenated index is added only when several sources share a
        domain.
        """
        citation_to_domain_id = {}
        for domain, citations in self._group_by_domain(sources).items():
            needs_id = len(citations) > 1
            for idx, (citation_num, url) in enumerate(citations, 1):
                label = f"{domain}-{idx}" if needs_id else domain
                citation_to_domain_id[citation_num] = (label, url)

        return self._apply_links(
            content, citation_to_domain_id, self._format_labelled_link
        )

    def _format_domain_id_always_hyperlinks(
        self, content: str, sources: Dict[str, Tuple[str, str]]
    ) -> str:
        """Replace [1] with [domain.com-1] hyperlinked version, always with IDs."""
        citation_to_domain_id = {}
        for domain, citations in self._group_by_domain(sources).items():
            # Always add hyphen and number for consistency
            for idx, (citation_num, url) in enumerate(citations, 1):
                citation_to_domain_id[citation_num] = (f"{domain}-{idx}", url)

        return self._apply_links(
            content, citation_to_domain_id, self._format_labelled_link
        )

    def _to_superscript(self, text: str) -> str:
        """Convert ASCII digits in *text* to Unicode superscript digits.

        Characters other than 0-9 are passed through unchanged.
        """
        return text.translate(str.maketrans("0123456789", "⁰¹²³⁴⁵⁶⁷⁸⁹"))

    def _extract_domain(self, url: str) -> str:
        """Extract a short display domain from *url*.

        A leading ``www.`` is dropped. Hosts equal to, or sub-domains of,
        an entry in ``_KNOWN_DOMAINS`` are shown as that entry; any other
        host is reduced to its last two dot-separated labels.

        Returns:
            The display domain, or "source" if parsing raises
            ``ValueError``/``AttributeError``.
        """
        try:
            domain = urlparse(url).netloc
            # Remove www. prefix if present
            if domain.startswith("www."):
                domain = domain[4:]

            # Match on whole labels only: a substring test would also label
            # "notgithub.com" or "github.com.evil.org" as "github.com".
            for known in self._KNOWN_DOMAINS:
                if domain == known or domain.endswith("." + known):
                    return known

            # For other domains, extract main domain
            parts = domain.split(".")
            if len(parts) >= 2:
                return ".".join(parts[-2:])
            return domain
        except (ValueError, AttributeError):
            return "source"

434

435

class QuartoExporter:
    """Export markdown documents to Quarto (.qmd) format."""

    def __init__(self):
        """Compile the patterns for single and comma-grouped citations."""
        # Also match Unicode lenticular brackets 【】 (U+3010 and U+3011)
        # that LLMs sometimes generate
        self.citation_pattern = re.compile(
            r"(?<![\[【])[\[【](\d+)[\]】](?![\]】])"
        )
        self.comma_citation_pattern = re.compile(
            r"[\[【](\d+(?:,\s*\d+)+)[\]】]"
        )

    def export_to_quarto(self, content: str, title: str = None) -> str:
        """
        Convert markdown document to Quarto format.

        The result is a YAML front-matter header, the content with numeric
        citations rewritten to ``[@refN]`` keys, and a callout containing
        the BibTeX entries that must be saved as ``references.bib``.

        Args:
            content: Markdown content
            title: Document title (if None or empty, the first level-1
                heading is used, else "Research Report")

        Returns:
            Quarto formatted content
        """
        # Extract title from markdown if not provided
        if not title:
            title_match = re.search(r"^#\s+(.+)$", content, re.MULTILINE)
            title = title_match.group(1) if title_match else "Research Report"

        from datetime import datetime, timezone

        current_date = datetime.now(timezone.utc).strftime("%Y-%m-%d")

        # The title sits inside a double-quoted YAML scalar, so backslashes
        # and double quotes must be escaped or the header becomes invalid.
        safe_title = title.replace("\\", "\\\\").replace('"', '\\"')

        # Nesting under "format:" is significant in YAML.
        yaml_header = "\n".join(
            [
                "---",
                f'title: "{safe_title}"',
                'author: "Local Deep Research"',
                f'date: "{current_date}"',
                "format:",
                "  html:",
                "    toc: true",
                "    toc-depth: 3",
                "    number-sections: true",
                "  pdf:",
                "    toc: true",
                "    number-sections: true",
                "    colorlinks: true",
                "bibliography: references.bib",
                "csl: apa.csl",
                "---",
                "",
                "",
            ]
        )

        # First handle comma-separated citations like [1, 2, 3]
        def replace_comma_citations(match):
            nums = (num.strip() for num in match.group(1).split(","))
            # Pandoc/Quarto separate multiple citations with semicolons; a
            # comma after a key starts a locator/suffix instead.
            return "[" + "; ".join(f"@ref{num}" for num in nums) + "]"

        processed_content = self.comma_citation_pattern.sub(
            replace_comma_citations, content
        )

        # Then convert individual citations to Quarto format [@citation]
        processed_content = self.citation_pattern.sub(
            lambda match: f"[@ref{match.group(1)}]", processed_content
        )

        # Generate bibliography file content from the untouched input
        bib_content = self._generate_bibliography(content)

        # Add note about bibliography file
        bibliography_note = (
            "\n\n::: {.callout-note}\n## Bibliography File Required\n\nThis document requires a `references.bib` file in the same directory with the following content:\n\n```bibtex\n"
            + bib_content
            + "\n```\n:::\n"
        )

        return yaml_header + processed_content + bibliography_note

    def _generate_bibliography(self, content: str) -> str:
        """Generate BibTeX bibliography from sources.

        Every line of the form ``[N] Title``, optionally followed by a
        ``URL: ...`` line, yields one ``@misc{refN, ...}`` entry.

        Args:
            content: Markdown content containing the source list

        Returns:
            The BibTeX entries separated by blank lines ("" if none found)
        """
        from datetime import datetime, timezone

        sources_pattern = re.compile(
            r"^\[(\d+)\]\s*(.+?)(?:\n\s*URL:\s*(.+?))?$", re.MULTILINE
        )
        # The sources carry no publication date, so the export year is used
        # instead of a fixed constant that silently goes stale.
        year = datetime.now(timezone.utc).year

        entries = []
        for match in sources_pattern.finditer(content):
            citation_num = match.group(1)
            title = match.group(2).strip()
            url = match.group(3).strip() if match.group(3) else ""

            fields = [f"@misc{{ref{citation_num},\n"]
            fields.append(f' title = "{{{title}}}",\n')
            if url:
                fields.append(f" url = {{{url}}},\n")
                fields.append(f' howpublished = "\\url{{{url}}}",\n')
            fields.append(f" year = {{{year}}},\n")
            fields.append(' note = "Accessed: \\today"\n')
            fields.append("}\n")
            entries.append("".join(fields))

        return "\n".join(entries).strip()

550

551

class RISExporter:
    """Export references to RIS format for reference managers like Zotero."""

    # One source entry: "[N]" at line start up to the next "[N]" line or end.
    _ENTRY_PATTERN = re.compile(
        r"^\[(\d+)\]\s*(.+?)(?=^\[\d+\]|\Z)", re.MULTILINE | re.DOTALL
    )

    # Markers of a following section; the sources section ends at the
    # earliest one found.
    _NEXT_SECTION_MARKERS = (
        "\n## ALL SOURCES",
        "\n### ALL SOURCES",
        "\n## Research Metrics",
        "\n### Research Metrics",
        "\n## SEARCH QUESTIONS",
        "\n### SEARCH QUESTIONS",
        "\n## DETAILED FINDINGS",
        "\n### DETAILED FINDINGS",
        "\n---",  # Horizontal rule often separates sections
    )

    # (registrable domain, publisher name); sub-domains match as well.
    _PUBLISHERS = (
        ("github.com", "GitHub"),
        ("arxiv.org", "arXiv"),
        ("reddit.com", "Reddit"),
        ("youtube.com", "YouTube"),
        ("medium.com", "Medium"),
        ("pypi.org", "Python Package Index (PyPI)"),
    )

    def __init__(self):
        """Compile the source-line pattern kept as a public attribute."""
        self.sources_pattern = re.compile(
            r"^\[(\d+(?:,\s*\d+)*)\]\s*(.+?)(?:\n\s*URL:\s*(.+?))?$",
            re.MULTILINE,
        )

    def export_to_ris(self, content: str) -> str:
        """
        Extract references from markdown and convert to RIS format.

        Only the first sources section is read: it starts where
        ``find_sources_section`` reports and ends at the earliest
        following-section marker (or the end of the text).

        Args:
            content: Markdown content with sources

        Returns:
            RIS formatted references joined by newlines, or "" when no
            sources section exists
        """
        sources_start = find_sources_section(content)
        if sources_start == -1:
            return ""

        sources_content = content[sources_start:]

        # Cut at the next major section to avoid duplicated source lists.
        sources_end = len(sources_content)
        for marker in self._NEXT_SECTION_MARKERS:
            pos = sources_content.find(marker)
            if pos != -1 and pos < sources_end:
                sources_end = pos
        sources_content = sources_content[:sources_end]

        ris_entries = []
        seen_refs = set()  # Track which references we've already processed

        for match in self._ENTRY_PATTERN.finditer(sources_content):
            citation_num = match.group(1)
            entry_text = match.group(2).strip()

            # First line is the title; later lines carry URL/DOI/metadata.
            entry_lines = entry_text.split("\n")
            title = entry_lines[0].strip()

            url = ""
            metadata = {}
            for line in entry_lines[1:]:
                line = line.strip()
                if line.startswith("URL:"):
                    url = line[4:].strip()
                elif line.startswith("DOI:"):
                    metadata["doi"] = line[4:].strip()
                elif line.startswith("Published in"):
                    metadata["journal"] = line[12:].strip()
                elif line:
                    # Store other lines as additional metadata
                    metadata.setdefault("additional", []).append(line)

            # Create a unique key to avoid duplicates
            ref_key = (citation_num, title, url)
            if ref_key not in seen_refs:
                seen_refs.add(ref_key)
                # The full entry text is passed so year/author/DOI can be
                # picked up from any of its lines.
                ris_entries.append(
                    self._create_ris_entry(
                        citation_num, entry_text, url, metadata
                    )
                )

        return "\n".join(ris_entries)

    def _create_ris_entry(
        self, ref_id: str, full_text: str, url: str = "", metadata: dict = None
    ) -> str:
        """Create a single RIS entry.

        Args:
            ref_id: Citation number; written as ``ID  - ref<ref_id>``
            full_text: Entry text; its first line is the title, and the
                whole text is searched for a year, "by <authors>" and a DOI
            url: Source URL, or "" if unknown
            metadata: Optional pre-parsed fields; only ``"doi"`` is read

        Returns:
            The RIS record as newline-joined ``XX  - value`` lines, from
            ``TY`` through ``ER``
        """
        from datetime import datetime, timezone
        from urllib.parse import urlparse

        if metadata is None:
            metadata = {}

        # Extract title from first line
        title = full_text.split("\n")[0].strip()

        # Extract year from full text (looks for 4-digit year)
        year_match = re.search(r"\b(19\d{2}|20\d{2})\b", full_text)
        year = year_match.group(1) if year_match else None

        # Extract authors if present (looks for "by Author1, Author2")
        authors_match = re.search(
            r"\bby\s+([^.\n]+?)(?:\.|\n|$)", full_text, re.IGNORECASE
        )
        authors = []
        if authors_match:
            # Split by 'and', '&' or ','
            author_parts = re.split(
                r"\s*(?:,|\sand\s|&)\s*", authors_match.group(1)
            )
            authors = [a.strip() for a in author_parts if a.strip()]

        # Extract DOI from metadata or text
        doi = metadata.get("doi")
        if not doi:
            doi_match = re.search(
                r"DOI:\s*([^\s\n]+)", full_text, re.IGNORECASE
            )
            doi = doi_match.group(1) if doi_match else None

        # Clean title - remove author and metadata info for cleaner title
        clean_title = title
        if authors_match and authors_match.start() < len(title):
            clean_title = (
                title[: authors_match.start()] + title[authors_match.end() :]
                if authors_match.end() < len(title)
                else title[: authors_match.start()]
            )
        for noise in (
            r"\s*DOI:\s*[^\s]+",
            r"\s*Published in.*",
            r"\s*Volume.*",
            r"\s*Pages.*",
        ):
            clean_title = re.sub(noise, "", clean_title, flags=re.IGNORECASE)
        clean_title = clean_title.strip()

        # The record is built in its own list so the raw source text never
        # leaks into the output. RIS tag lines are "XX", two spaces, "-",
        # one space, then the value.
        ris = [
            "TY  - ELEC",  # electronic source/website
            f"ID  - ref{ref_id}",
            f"TI  - {clean_title if clean_title else title}",
        ]
        ris.extend(f"AU  - {author}" for author in authors)
        if doi:
            ris.append(f"DO  - {doi}")
        if year:
            ris.append(f"PY  - {year}")

        if url:
            ris.append(f"UR  - {url}")

            # Try to extract domain as publisher
            try:
                domain = urlparse(url).netloc
                if domain.startswith("www."):
                    domain = domain[4:]
                publisher = next(
                    (
                        name
                        for known, name in self._PUBLISHERS
                        if domain == known or domain.endswith("." + known)
                    ),
                    domain,  # fall back to the bare domain
                )
                # A URL without a host would otherwise yield an empty tag.
                if publisher:
                    ris.append(f"PB  - {publisher}")
            except (ValueError, AttributeError):
                pass

        now = datetime.now(timezone.utc)
        ris.append(f"Y1  - {now.year}")  # year accessed
        ris.append(f"DA  - {now.strftime('%Y/%m/%d')}")  # date accessed
        ris.append("LA  - en")
        ris.append("ER  - ")  # end of reference

        return "\n".join(ris)

787

788

class LaTeXExporter:
    """Export markdown documents to LaTeX format."""

    # Single-pass escape table. A chain of str.replace calls would re-escape
    # the braces of an already inserted "\textbackslash{}".
    _LATEX_ESCAPES = str.maketrans(
        {
            "\\": r"\textbackslash{}",
            "&": r"\&",
            "%": r"\%",
            "$": r"\$",
            "#": r"\#",
            "_": r"\_",
            "{": r"\{",
            "}": r"\}",
            "~": r"\textasciitilde{}",
            "^": r"\textasciicircum{}",
        }
    )

    def __init__(self):
        """Compile the citation, heading and emphasis conversion patterns."""
        # Also match Unicode lenticular brackets 【】 (U+3010 and U+3011)
        # that LLMs sometimes generate
        self.citation_pattern = re.compile(r"[\[【](\d+)[\]】]")
        self.heading_patterns = [
            (re.compile(r"^# (.+)$", re.MULTILINE), r"\\section{\1}"),
            (re.compile(r"^## (.+)$", re.MULTILINE), r"\\subsection{\1}"),
            (re.compile(r"^### (.+)$", re.MULTILINE), r"\\subsubsection{\1}"),
        ]
        # Bold must be converted before italic because "**" contains "*".
        self.emphasis_patterns = [
            (re.compile(r"\*\*(.+?)\*\*"), r"\\textbf{\1}"),
            (re.compile(r"\*(.+?)\*"), r"\\textit{\1}"),
            (re.compile(r"`(.+?)`"), r"\\texttt{\1}"),
        ]

    def export_to_latex(self, content: str) -> str:
        """
        Convert markdown document to LaTeX format.

        Steps, in order: escape ``&``, ``%`` and ``_`` outside ``$`` math,
        convert headings, emphasis, ``[n]`` citations and bullet lists,
        then append a bibliography built from the sources section.

        Args:
            content: Markdown content

        Returns:
            LaTeX formatted content (header, body, bibliography, footer)
        """
        # Splitting on "$" puts text outside math at even indices.
        segments = content.split("$")
        last = len(segments) - 1
        for i, segment in enumerate(segments):
            if i % 2:
                continue  # odd index: inline math, left untouched
            # An even segment with an empty neighbour on both sides sits
            # between "$$" pairs and is treated as display math.
            between_double_dollars = (
                0 < i < last
                and segments[i - 1] == ""
                and segments[i + 1] == ""
            )
            if not between_double_dollars:
                segments[i] = self._escape_text_segment(segment)
        body_content = "$".join(segments)

        # Convert headings
        for pattern, replacement in self.heading_patterns:
            body_content = pattern.sub(replacement, body_content)

        # Convert emphasis
        for pattern, replacement in self.emphasis_patterns:
            body_content = pattern.sub(replacement, body_content)

        # Convert citations to LaTeX \cite{} format
        body_content = self.citation_pattern.sub(r"\\cite{\1}", body_content)

        # Convert lists
        body_content = self._convert_lists(body_content)

        return (
            self._create_latex_header()
            + body_content
            + self._create_bibliography(content)
            + self._create_latex_footer()
        )

    @staticmethod
    def _escape_text_segment(segment: str) -> str:
        """Escape ``&``, ``%`` and ``_`` on every non-heading line.

        Lines starting with ``#`` are skipped so the heading patterns still
        match them; ``*``, ``[`` and ``]`` are left for the later emphasis
        and citation passes.
        """
        escaped = []
        for line in segment.split("\n"):
            if not line.strip().startswith("#"):
                line = (
                    line.replace("&", r"\&")
                    .replace("%", r"\%")
                    .replace("_", r"\_")
                )
            escaped.append(line)
        return "\n".join(escaped)

    def _create_latex_header(self) -> str:
        """Create LaTeX document header."""
        return r"""\documentclass[12pt]{article}
\usepackage[utf8]{inputenc}
\usepackage{hyperref}
\usepackage{cite}
\usepackage{url}

\title{Research Report}
\date{\today}

\begin{document}
\maketitle

"""

    def _create_latex_footer(self) -> str:
        """Create LaTeX document footer."""
        return "\n\\end{document}\n"

    def _escape_latex(self, text: str) -> str:
        """Escape special LaTeX characters in text.

        Handles ``\\ & % $ # _ { } ~ ^`` in one pass, so characters
        produced by one replacement are never escaped again.

        Args:
            text: Plain text to embed in LaTeX

        Returns:
            The escaped text
        """
        return text.translate(self._LATEX_ESCAPES)

    def _convert_lists(self, content: str) -> str:
        """Convert markdown lists to LaTeX format.

        ``- item`` lines become ``\\item item`` and each run of them is
        wrapped in an ``itemize`` environment. A blank line does not close
        the environment; the next non-blank, non-item line does.
        """
        # Simple conversion for bullet points
        content = re.sub(r"^- (.+)$", r"\\item \1", content, flags=re.MULTILINE)

        result = []
        in_list = False
        for line in content.split("\n"):
            is_item = line.strip().startswith("\\item")
            if is_item and not in_list:
                result.append("\\begin{itemize}")
                in_list = True
            elif not is_item and in_list and line.strip():
                result.append("\\end{itemize}")
                in_list = False
            result.append(line)

        if in_list:
            result.append("\\end{itemize}")

        return "\n".join(result)

    def _create_bibliography(self, content: str) -> str:
        """Extract sources and create LaTeX bibliography.

        Args:
            content: Original markdown content

        Returns:
            A ``thebibliography`` environment with one ``\\bibitem`` per
            ``[N] Title`` source line, or "" when there is no sources
            section
        """
        sources_start = find_sources_section(content)
        if sources_start == -1:
            return ""

        pattern = re.compile(
            r"^\[(\d+)\]\s*(.+?)(?:\n\s*URL:\s*(.+?))?$", re.MULTILINE
        )

        chunks = ["\n\\begin{thebibliography}{99}\n"]
        for match in pattern.finditer(content[sources_start:]):
            citation_num = match.group(1)
            # Escape special LaTeX characters in title
            escaped_title = self._escape_latex(match.group(2).strip())
            url = match.group(3).strip() if match.group(3) else ""

            if url:
                chunks.append(
                    f"\\bibitem{{{citation_num}}} {escaped_title}. \\url{{{url}}}\n"
                )
            else:
                chunks.append(f"\\bibitem{{{citation_num}}} {escaped_title}.\n")
        chunks.append("\\end{thebibliography}\n")

        return "".join(chunks)

Coverage for src / local_deep_research / text_optimization / citation_formatter.py: 97%

396 statements