Coverage for src / local_deep_research / text_optimization / citation_formatter.py: 98%
398 statements
« prev ^ index » next coverage.py v7.13.5, created at 2026-04-14 23:55 +0000
« prev ^ index » next coverage.py v7.13.5, created at 2026-04-14 23:55 +0000
1"""Citation formatter for adding hyperlinks and alternative citation styles."""
3import re
4from enum import Enum
5from typing import Any, Dict, Tuple
6from urllib.parse import urlparse
_SOURCES_SECTION_PATTERNS = [
    # Markdown headings: "# Sources", "## References", "### Bibliography", ...
    re.compile(
        r"^#{1,3}\s*(?:Sources|References|Bibliography|Citations)",
        re.MULTILINE | re.IGNORECASE,
    ),
    # Bare section labels on their own line, e.g. "Sources:" or "References"
    re.compile(
        r"^(?:Sources|References|Bibliography|Citations):?\s*$",
        re.MULTILINE | re.IGNORECASE,
    ),
]


def find_sources_section(content: str) -> int:
    """Locate the sources/references section inside *content*.

    Patterns are tried in order (headings first, then bare labels).

    Returns:
        Character offset where the section heading starts, or -1 when no
        section is present.
    """
    hit = next(
        (
            m
            for pattern in _SOURCES_SECTION_PATTERNS
            if (m := pattern.search(content)) is not None
        ),
        None,
    )
    return hit.start() if hit is not None else -1
class CitationMode(Enum):
    """Supported strategies for rendering in-text citations."""

    # [1] rendered as a markdown hyperlink
    NUMBER_HYPERLINKS = "number_hyperlinks"
    # [arxiv.org] rendered as a hyperlink
    DOMAIN_HYPERLINKS = "domain_hyperlinks"
    # [arxiv.org] normally, [arxiv.org-1] when a domain repeats
    DOMAIN_ID_HYPERLINKS = "domain_id_hyperlinks"
    # [arxiv.org-1] with a numeric suffix on every citation
    DOMAIN_ID_ALWAYS_HYPERLINKS = "domain_id_always_hyperlinks"
    # Plain [1], left untouched
    NO_HYPERLINKS = "no_hyperlinks"
class CitationFormatter:
    """Formats citations in markdown documents with various styles.

    The input document is expected to end with a sources/references section
    whose entries look like ``[1] Title`` followed by an optional
    ``URL: ...`` line.  Depending on the configured :class:`CitationMode`,
    in-text markers such as ``[1]``, ``[1, 2]`` and ``Source 1`` are
    rewritten (e.g. into markdown hyperlinks); the sources section itself
    is always left untouched.
    """

    def __init__(self, mode: CitationMode = CitationMode.NUMBER_HYPERLINKS):
        """Initialize the formatter.

        Args:
            mode: Citation style to apply; see :class:`CitationMode`.
        """
        self.mode = mode
        # Negative lookbehind/lookahead skip citations that are already
        # formatted (e.g. the inner "[1]" of "[[1]](url)").  Unicode
        # lenticular brackets 【】 (U+3010/U+3011) are accepted as well,
        # since LLMs sometimes generate them instead of ASCII brackets.
        self.citation_pattern = re.compile(
            r"(?<![\[【])[\[【](\d+)[\]】](?![\]】])"
        )
        self.comma_citation_pattern = re.compile(
            r"[\[【](\d+(?:,\s*\d+)+)[\]】]"
        )
        # Prose references such as "Source 3" / "source 3".
        self.source_word_pattern = re.compile(r"\b[Ss]ource\s+(\d+)\b")
        # One sources-section entry: "[n] title" plus an optional URL line.
        self.sources_pattern = re.compile(
            r"^\[(\d+(?:,\s*\d+)*)\]\s*(.+?)(?:\n\s*URL:\s*(.+?))?$",
            re.MULTILINE,
        )

    def _create_source_word_replacer(self, formatter_func):
        """Create a ``re.sub`` callback for "Source X" patterns.

        Args:
            formatter_func: Takes a citation number (str) and returns the
                formatted replacement text.

        Returns:
            A replacement function for use with regex sub.
        """

        def replace_source_word(match):
            return formatter_func(match.group(1))

        return replace_source_word

    def _create_citation_formatter(self, sources_dict, format_pattern):
        """Create a citation formatter with a plain ``[n]`` fallback.

        Args:
            sources_dict: Maps citation numbers (str) to per-style data.
            format_pattern: ``(citation_num, data) -> str`` callback.

        Returns:
            A function that formats known citations and renders unknown
            ones as ``[n]``.
        """

        def formatter(citation_num):
            data = sources_dict.get(citation_num)
            if data is not None:
                return format_pattern(citation_num, data)
            return f"[{citation_num}]"

        return formatter

    def _replace_comma_citations(self, content, lookup, format_one):
        """Replace comma-separated citations like ``[1, 2, 3]``.

        Each listed number is formatted individually (unknown numbers fall
        back to ``[n]``) and the results are concatenated.

        Args:
            content: Text to process.
            lookup: Dict mapping citation number (str) to data.
            format_one: ``(num, data) -> str`` single-citation formatter.
        """

        def _replacer(match):
            parts = []
            for num in (n.strip() for n in match.group(1).split(",")):
                data = lookup.get(num)
                parts.append(
                    format_one(num, data) if data is not None else f"[{num}]"
                )
            return "".join(parts)

        return self.comma_citation_pattern.sub(_replacer, content)

    def _apply_citation_style(self, content, lookup, format_one):
        """Apply one citation style to every marker variant in *content*.

        Shared driver for all formatting modes: handles comma groups,
        single bracketed citations, and "Source X" prose references.

        Args:
            content: Document body (without the sources section).
            lookup: Dict mapping citation number (str) to per-style data.
            format_one: ``(num, data) -> str`` single-citation formatter.
        """
        # Comma groups first so the single-citation pattern never sees
        # their fragments.
        content = self._replace_comma_citations(content, lookup, format_one)
        formatter = self._create_citation_formatter(lookup, format_one)

        def replace_single(match):
            num = match.group(1)
            # Unknown numbers keep their original text (including the
            # original bracket style).
            return formatter(num) if num in lookup else match.group(0)

        content = self.citation_pattern.sub(replace_single, content)
        return self.source_word_pattern.sub(
            self._create_source_word_replacer(formatter), content
        )

    def format_document(self, content: str) -> str:
        """
        Format citations in the document according to the selected mode.

        Args:
            content: The markdown content to format

        Returns:
            Formatted markdown content
        """
        if self.mode == CitationMode.NO_HYPERLINKS:
            return content

        # Split off the sources section; only the body is rewritten.
        sources_start = self._find_sources_section(content)
        if sources_start == -1:
            return content

        document_content = content[:sources_start]
        sources_content = content[sources_start:]

        sources = self._parse_sources(sources_content)

        if self.mode == CitationMode.NUMBER_HYPERLINKS:
            formatted_content = self._format_number_hyperlinks(
                document_content, sources
            )
        elif self.mode == CitationMode.DOMAIN_HYPERLINKS:
            formatted_content = self._format_domain_hyperlinks(
                document_content, sources
            )
        elif self.mode == CitationMode.DOMAIN_ID_HYPERLINKS:
            formatted_content = self._format_domain_id_hyperlinks(
                document_content, sources
            )
        elif self.mode == CitationMode.DOMAIN_ID_ALWAYS_HYPERLINKS:
            formatted_content = self._format_domain_id_always_hyperlinks(
                document_content, sources
            )
        else:
            formatted_content = document_content

        # Rebuild the document with the untouched sources section.
        return formatted_content + sources_content

    def _find_sources_section(self, content: str) -> int:
        """Find the start of the sources/references section."""
        return find_sources_section(content)

    def _parse_sources(
        self, sources_content: str
    ) -> Dict[str, Tuple[str, str]]:
        """
        Parse sources section to extract citation numbers, titles, and URLs.

        Entries like ``[36, 3] Title`` register every listed number.

        Returns:
            Dictionary mapping citation number to (title, url) tuple
        """
        sources: Dict[str, Tuple[str, str]] = {}
        for match in self.sources_pattern.finditer(sources_content):
            title = match.group(2).strip()
            url = match.group(3).strip() if match.group(3) else ""
            for num in match.group(1).split(","):
                sources[num.strip()] = (title, url)
        return sources

    def _url_sources(
        self, sources: Dict[str, Tuple[str, str]]
    ) -> Dict[str, Tuple[str, str]]:
        """Return only the sources that carry a URL."""
        return {
            num: (title, url)
            for num, (title, url) in sources.items()
            if url
        }

    def _build_domain_id_map(
        self, sources: Dict[str, Tuple[str, str]], always_number: bool
    ) -> Dict[str, Tuple[str, str]]:
        """Map citation numbers to ``(domain-or-domain-id, url)`` pairs.

        Args:
            sources: Parsed sources (title, url) keyed by citation number.
            always_number: When True every citation gets a ``-N`` suffix;
                otherwise only domains cited more than once are numbered.
        """
        # Group citations by display domain, preserving insertion order.
        domain_citations: dict[str, list[Any]] = {}
        for citation_num, (_title, url) in sources.items():
            if url:
                domain = self._extract_domain(url)
                domain_citations.setdefault(domain, []).append(
                    (citation_num, url)
                )

        citation_to_domain_id: Dict[str, Tuple[str, str]] = {}
        for domain, citations in domain_citations.items():
            if always_number or len(citations) > 1:
                for idx, (citation_num, url) in enumerate(citations, 1):
                    citation_to_domain_id[citation_num] = (
                        f"{domain}-{idx}",
                        url,
                    )
            else:
                # Single citation from this domain - no ID suffix needed.
                citation_num, url = citations[0]
                citation_to_domain_id[citation_num] = (domain, url)
        return citation_to_domain_id

    def _format_number_hyperlinks(
        self, content: str, sources: Dict[str, Tuple[str, str]]
    ) -> str:
        """Replace [1] with a hyperlink where only the number is linked."""

        def format_number_link(citation_num, data):
            _, url = data
            return f"[[{citation_num}]]({url})"

        return self._apply_citation_style(
            content, self._url_sources(sources), format_number_link
        )

    def _format_domain_hyperlinks(
        self, content: str, sources: Dict[str, Tuple[str, str]]
    ) -> str:
        """Replace [1] with a hyperlinked [domain.com] marker."""

        def format_domain_link(citation_num, data):
            _, url = data
            return f"[[{self._extract_domain(url)}]]({url})"

        return self._apply_citation_style(
            content, self._url_sources(sources), format_domain_link
        )

    def _format_domain_id_hyperlinks(
        self, content: str, sources: Dict[str, Tuple[str, str]]
    ) -> str:
        """Replace [1] with [domain.com] / [domain.com-1] hyperlinks.

        The ``-N`` suffix is added only when several citations share the
        same domain.
        """
        mapping = self._build_domain_id_map(sources, always_number=False)

        def format_domain_id_link(citation_num, data):
            domain_id, url = data
            return f"[[{domain_id}]]({url})"

        return self._apply_citation_style(
            content, mapping, format_domain_id_link
        )

    def _format_domain_id_always_hyperlinks(
        self, content: str, sources: Dict[str, Tuple[str, str]]
    ) -> str:
        """Replace [1] with [domain.com-1] hyperlinks, always suffixed."""
        mapping = self._build_domain_id_map(sources, always_number=True)

        def format_domain_id_link(citation_num, data):
            domain_id, url = data
            return f"[[{domain_id}]]({url})"

        return self._apply_citation_style(
            content, mapping, format_domain_id_link
        )

    def _to_superscript(self, text: str) -> str:
        """Convert digits in *text* to Unicode superscript characters.

        Not used by the formatting modes above; kept for callers that want
        superscript citation markers.
        """
        superscript_map = {
            "0": "⁰",
            "1": "¹",
            "2": "²",
            "3": "³",
            "4": "⁴",
            "5": "⁵",
            "6": "⁶",
            "7": "⁷",
            "8": "⁸",
            "9": "⁹",
        }
        return "".join(superscript_map.get(c, c) for c in text)

    def _extract_domain(self, url: str) -> str:
        """Extract a short display domain from *url*.

        Falls back to the literal string ``"source"`` when the URL cannot
        be parsed.
        """
        try:
            parsed = urlparse(url)
            domain = parsed.netloc
            # Remove www. prefix if present.
            if domain.startswith("www."):
                domain = domain[4:]
            # Well-known hosts keep their canonical display form even when
            # cited via a subdomain.
            known_domains = {
                "arxiv.org": "arxiv.org",
                "github.com": "github.com",
                "reddit.com": "reddit.com",
                "youtube.com": "youtube.com",
                "pypi.org": "pypi.org",
                "milvus.io": "milvus.io",
                "medium.com": "medium.com",
            }
            for known, display in known_domains.items():
                if known in domain:
                    return display

            # Otherwise keep the last two labels
            # ("sub.example.com" -> "example.com").
            parts = domain.split(".")
            if len(parts) >= 2:
                return ".".join(parts[-2:])
            return domain
        except (ValueError, AttributeError):
            return "source"
436class QuartoExporter:
437 """Export markdown documents to Quarto (.qmd) format."""
439 def __init__(self):
440 # Also match Unicode lenticular brackets 【】 (U+3010 and U+3011) that LLMs sometimes generate
441 self.citation_pattern = re.compile(
442 r"(?<![\[【])[\[【](\d+)[\]】](?![\]】])"
443 )
444 self.comma_citation_pattern = re.compile(
445 r"[\[【](\d+(?:,\s*\d+)+)[\]】]"
446 )
448 def export_to_quarto(self, content: str, title: str | None = None) -> str:
449 """
450 Convert markdown document to Quarto format.
452 Args:
453 content: Markdown content
454 title: Document title (if None, will extract from content)
456 Returns:
457 Quarto formatted content
458 """
459 # Extract title from markdown if not provided
460 if not title:
461 title_match = re.search(r"^#\s+(.+)$", content, re.MULTILINE)
462 title = title_match.group(1) if title_match else "Research Report"
464 # Create Quarto YAML header
465 from datetime import datetime, UTC
467 current_date = datetime.now(UTC).strftime("%Y-%m-%d")
468 yaml_header = f"""---
469title: "{title}"
470author: "Local Deep Research"
471date: "{current_date}"
472format:
473 html:
474 toc: true
475 toc-depth: 3
476 number-sections: true
477 pdf:
478 toc: true
479 number-sections: true
480 colorlinks: true
481bibliography: references.bib
482csl: apa.csl
483---
485"""
487 # Process content
488 processed_content = content
490 # First handle comma-separated citations like [1, 2, 3]
491 def replace_comma_citations(match):
492 citation_nums = match.group(1)
493 # Split by comma and strip whitespace
494 nums = [num.strip() for num in citation_nums.split(",")]
495 refs = [f"@ref{num}" for num in nums]
496 return f"[{', '.join(refs)}]"
498 processed_content = self.comma_citation_pattern.sub(
499 replace_comma_citations, processed_content
500 )
502 # Then convert individual citations to Quarto format [@citation]
503 def replace_citation(match):
504 citation_num = match.group(1)
505 return f"[@ref{citation_num}]"
507 processed_content = self.citation_pattern.sub(
508 replace_citation, processed_content
509 )
511 # Generate bibliography file content
512 bib_content = self._generate_bibliography(content)
514 # Add note about bibliography file
515 bibliography_note = (
516 "\n\n::: {.callout-note}\n## Bibliography File Required\n\nThis document requires a `references.bib` file in the same directory with the following content:\n\n```bibtex\n"
517 + bib_content
518 + "\n```\n:::\n"
519 )
521 return yaml_header + processed_content + bibliography_note
523 def _generate_bibliography(self, content: str) -> str:
524 """Generate BibTeX bibliography from sources."""
525 sources_pattern = re.compile(
526 r"^\[(\d+)\]\s*(.+?)(?:\n\s*URL:\s*(.+?))?$", re.MULTILINE
527 )
529 bibliography = ""
530 matches = list(sources_pattern.finditer(content))
532 for match in matches:
533 citation_num = match.group(1)
534 title = match.group(2).strip()
535 url = match.group(3).strip() if match.group(3) else ""
537 # Generate BibTeX entry
538 bib_entry = f"@misc{{ref{citation_num},\n"
539 bib_entry += f' title = "{{{title}}}",\n'
540 if url:
541 bib_entry += f" url = {{{url}}},\n"
542 bib_entry += f' howpublished = "\\url{{{url}}}",\n'
543 bib_entry += f" year = {{{2024}}},\n"
544 bib_entry += ' note = "Accessed: \\today"\n'
545 bib_entry += "}\n"
547 bibliography += bib_entry + "\n"
549 return bibliography.strip()
552class RISExporter:
553 """Export references to RIS format for reference managers like Zotero."""
555 def __init__(self):
556 self.sources_pattern = re.compile(
557 r"^\[(\d+(?:,\s*\d+)*)\]\s*(.+?)(?:\n\s*URL:\s*(.+?))?$",
558 re.MULTILINE,
559 )
561 def export_to_ris(self, content: str) -> str:
562 """
563 Extract references from markdown and convert to RIS format.
565 Args:
566 content: Markdown content with sources
568 Returns:
569 RIS formatted references
570 """
571 # Find sources section
572 sources_start = find_sources_section(content)
573 if sources_start == -1:
574 return ""
576 # Find the end of the first sources section (before any other major section)
577 sources_content = content[sources_start:]
579 # Look for the next major section to avoid duplicates
580 next_section_markers = [
581 "\n## ALL SOURCES",
582 "\n### ALL SOURCES",
583 "\n## Research Metrics",
584 "\n### Research Metrics",
585 "\n## SEARCH QUESTIONS",
586 "\n### SEARCH QUESTIONS",
587 "\n## DETAILED FINDINGS",
588 "\n### DETAILED FINDINGS",
589 "\n---", # Horizontal rule often separates sections
590 ]
592 sources_end = len(sources_content)
593 for marker in next_section_markers:
594 pos = sources_content.find(marker)
595 if pos != -1 and pos < sources_end:
596 sources_end = pos
598 sources_content = sources_content[:sources_end]
600 # Parse sources and generate RIS entries
601 ris_entries = []
602 seen_refs = set() # Track which references we've already processed
604 # Split sources into individual entries
605 import re
607 # Pattern to match each source entry
608 source_entry_pattern = re.compile(
609 r"^\[(\d+)\]\s*(.+?)(?=^\[\d+\]|\Z)", re.MULTILINE | re.DOTALL
610 )
612 for match in source_entry_pattern.finditer(sources_content):
613 citation_num = match.group(1)
614 entry_text = match.group(2).strip()
616 # Extract the title (first line)
617 lines = entry_text.split("\n")
618 title = lines[0].strip()
620 # Extract URL, DOI, and other metadata from subsequent lines
621 url = ""
622 metadata = {}
623 for line in lines[1:]:
624 line = line.strip()
625 if line.startswith("URL:"):
626 url = line[4:].strip()
627 elif line.startswith("DOI:"):
628 metadata["doi"] = line[4:].strip()
629 elif line.startswith("Published in"):
630 metadata["journal"] = line[12:].strip()
631 # Add more metadata parsing as needed
632 elif line: 632 ↛ 623line 632 didn't jump to line 623 because the condition on line 632 was always true
633 # Store other lines as additional metadata
634 if "additional" not in metadata: 634 ↛ 636line 634 didn't jump to line 636 because the condition on line 634 was always true
635 metadata["additional"] = []
636 additional = metadata["additional"]
637 if isinstance(additional, list): 637 ↛ 623line 637 didn't jump to line 623 because the condition on line 637 was always true
638 additional.append(line)
640 # Combine title with additional metadata lines for full context
641 full_text = entry_text
643 # Create a unique key to avoid duplicates
644 ref_key = (citation_num, title, url)
645 if ref_key not in seen_refs: 645 ↛ 612line 645 didn't jump to line 612 because the condition on line 645 was always true
646 seen_refs.add(ref_key)
647 # Create RIS entry with full text for metadata extraction
648 ris_entry = self._create_ris_entry(
649 citation_num, full_text, url, metadata
650 )
651 ris_entries.append(ris_entry)
653 return "\n".join(ris_entries)
655 def _create_ris_entry(
656 self,
657 ref_id: str,
658 full_text: str,
659 url: str = "",
660 metadata: dict | None = None,
661 ) -> str:
662 """Create a single RIS entry."""
663 lines = []
665 # Parse metadata from full text
666 import re
668 if metadata is None:
669 metadata = {}
671 # Extract title from first line
672 lines = full_text.split("\n")
673 title = lines[0].strip()
675 # Extract year from full text (looks for 4-digit year)
676 year_match = re.search(r"\b(19\d{2}|20\d{2})\b", full_text)
677 year = year_match.group(1) if year_match else None
679 # Extract authors if present (looks for "by Author1, Author2")
680 authors_match = re.search(
681 r"\bby\s+([^.\n]+?)(?:\.|\n|$)", full_text, re.IGNORECASE
682 )
683 authors = []
684 if authors_match:
685 authors_text = authors_match.group(1)
686 # Split by 'and' or ','
687 author_parts = re.split(r"\s*(?:,|\sand\s|&)\s*", authors_text)
688 authors = [a.strip() for a in author_parts if a.strip()]
690 # Extract DOI from metadata or text
691 doi = metadata.get("doi")
692 if not doi:
693 doi_match = re.search(
694 r"DOI:\s*([^\s\n]+)", full_text, re.IGNORECASE
695 )
696 doi = doi_match.group(1) if doi_match else None
698 # Clean title - remove author and metadata info for cleaner title
699 clean_title = title
700 if authors_match and authors_match.start() < len(title):
701 clean_title = (
702 title[: authors_match.start()] + title[authors_match.end() :]
703 if authors_match.end() < len(title)
704 else title[: authors_match.start()]
705 )
706 clean_title = re.sub(
707 r"\s*DOI:\s*[^\s]+", "", clean_title, flags=re.IGNORECASE
708 )
709 clean_title = re.sub(
710 r"\s*Published in.*", "", clean_title, flags=re.IGNORECASE
711 )
712 clean_title = re.sub(
713 r"\s*Volume.*", "", clean_title, flags=re.IGNORECASE
714 )
715 clean_title = re.sub(
716 r"\s*Pages.*", "", clean_title, flags=re.IGNORECASE
717 )
718 clean_title = clean_title.strip()
720 # TY - Type of reference (ELEC for electronic source/website)
721 lines.append("TY - ELEC")
723 # ID - Reference ID
724 lines.append(f"ID - ref{ref_id}")
726 # TI - Title
727 lines.append(f"TI - {clean_title if clean_title else title}")
729 # AU - Authors
730 for author in authors:
731 lines.append(f"AU - {author}")
733 # DO - DOI
734 if doi:
735 lines.append(f"DO - {doi}")
737 # PY - Publication year (if found in title)
738 if year:
739 lines.append(f"PY - {year}")
741 # UR - URL
742 if url:
743 lines.append(f"UR - {url}")
745 # Try to extract domain as publisher
746 try:
747 from urllib.parse import urlparse
749 parsed = urlparse(url)
750 domain = parsed.netloc
751 if domain.startswith("www."):
752 domain = domain[4:]
753 # Extract readable publisher name from domain
754 if domain == "github.com" or domain.endswith(".github.com"):
755 lines.append("PB - GitHub")
756 elif domain == "arxiv.org" or domain.endswith(".arxiv.org"):
757 lines.append("PB - arXiv")
758 elif domain == "reddit.com" or domain.endswith(".reddit.com"):
759 lines.append("PB - Reddit")
760 elif (
761 domain == "youtube.com"
762 or domain == "m.youtube.com"
763 or domain.endswith(".youtube.com")
764 ):
765 lines.append("PB - YouTube")
766 elif domain == "medium.com" or domain.endswith(".medium.com"):
767 lines.append("PB - Medium")
768 elif domain == "pypi.org" or domain.endswith(".pypi.org"):
769 lines.append("PB - Python Package Index (PyPI)")
770 else:
771 # Use domain as publisher
772 lines.append(f"PB - {domain}")
773 except (ValueError, AttributeError):
774 pass
776 # Y1 - Year accessed (current year)
777 from datetime import datetime, UTC
779 current_year = datetime.now(UTC).year
780 lines.append(f"Y1 - {current_year}")
782 # DA - Date accessed
783 current_date = datetime.now(UTC).strftime("%Y/%m/%d")
784 lines.append(f"DA - {current_date}")
786 # LA - Language
787 lines.append("LA - en")
789 # ER - End of reference
790 lines.append("ER - ")
792 return "\n".join(lines)
class LaTeXExporter:
    """Export markdown documents to LaTeX format."""

    def __init__(self):
        # Unicode lenticular brackets 【】 (U+3010/U+3011) are matched too,
        # because LLMs sometimes generate them instead of ASCII brackets.
        self.citation_pattern = re.compile(r"[\[【](\d+)[\]】]")
        self.heading_patterns = [
            (re.compile(r"^# (.+)$", re.MULTILINE), r"\\section{\1}"),
            (re.compile(r"^## (.+)$", re.MULTILINE), r"\\subsection{\1}"),
            (re.compile(r"^### (.+)$", re.MULTILINE), r"\\subsubsection{\1}"),
        ]
        self.emphasis_patterns = [
            (re.compile(r"\*\*(.+?)\*\*"), r"\\textbf{\1}"),
            (re.compile(r"\*(.+?)\*"), r"\\textit{\1}"),
            (re.compile(r"`(.+?)`"), r"\\texttt{\1}"),
        ]

    def export_to_latex(self, content: str) -> str:
        """
        Convert markdown document to LaTeX format.

        Args:
            content: Markdown content

        Returns:
            LaTeX formatted content
        """
        latex_content = self._create_latex_header()

        body_content = content

        # Escape special LaTeX characters but preserve math mode.
        # Split by $ so odd-indexed parts (inside math) are untouched.
        parts = body_content.split("$")
        for i in range(len(parts)):
            # Even indices are outside math mode.
            if i % 2 == 0:
                # Skip parts that sit between the two "$" of a "$$"
                # display-math delimiter (neighbouring parts are empty).
                if not (
                    i > 0
                    and parts[i - 1] == ""
                    and i < len(parts) - 1
                    and parts[i + 1] == ""
                ):
                    # Preserve patterns that are processed later:
                    # headings (#), emphasis (*), and citations ([n]).
                    lines = parts[i].split("\n")
                    for j, line in enumerate(lines):
                        # Heading lines are left alone entirely.
                        if not line.strip().startswith("#"):
                            temp_line = line
                            # Escape special chars except *, #, [, ] and
                            # braces (braces may belong to citations).
                            temp_line = temp_line.replace("&", r"\&")
                            temp_line = temp_line.replace("%", r"\%")
                            temp_line = temp_line.replace("_", r"\_")
                            lines[j] = temp_line
                    parts[i] = "\n".join(lines)
        body_content = "$".join(parts)

        # Convert headings.
        for pattern, replacement in self.heading_patterns:
            body_content = pattern.sub(replacement, body_content)

        # Convert emphasis.
        for pattern, replacement in self.emphasis_patterns:
            body_content = pattern.sub(replacement, body_content)

        # Convert citations to LaTeX \cite{} format.
        body_content = self.citation_pattern.sub(r"\\cite{\1}", body_content)

        # Convert lists.
        body_content = self._convert_lists(body_content)

        latex_content += body_content
        latex_content += self._create_bibliography(content)
        latex_content += self._create_latex_footer()

        return latex_content

    def _create_latex_header(self) -> str:
        """Create LaTeX document header (preamble through \\maketitle)."""
        return r"""\documentclass[12pt]{article}
\usepackage[utf8]{inputenc}
\usepackage{hyperref}
\usepackage{cite}
\usepackage{url}

\title{Research Report}
\date{\today}

\begin{document}
\maketitle

"""

    def _create_latex_footer(self) -> str:
        """Create LaTeX document footer."""
        return "\n\\end{document}\n"

    def _escape_latex(self, text: str) -> str:
        """Escape special LaTeX characters in text.

        A single character-by-character pass avoids the ordering bug of
        sequential ``str.replace`` calls, where escaping ``\\`` first
        inserts ``{}`` that a later brace pass would escape again
        (producing ``\\textbackslash\\{\\}``).
        """
        specials = {
            "\\": r"\textbackslash{}",
            "&": r"\&",
            "%": r"\%",
            "$": r"\$",
            "#": r"\#",
            "_": r"\_",
            "{": r"\{",
            "}": r"\}",
            "~": r"\textasciitilde{}",
            "^": r"\textasciicircum{}",
        }
        return "".join(specials.get(ch, ch) for ch in text)

    def _convert_lists(self, content: str) -> str:
        """Convert markdown bullet lists to LaTeX itemize environments."""
        # "- item" lines become \item lines.
        content = re.sub(r"^- (.+)$", r"\\item \1", content, flags=re.MULTILINE)

        # Wrap consecutive \item runs in \begin{itemize}...\end{itemize}.
        lines = content.split("\n")
        result = []
        in_list = False

        for line in lines:
            if line.strip().startswith("\\item"):
                if not in_list:
                    result.append("\\begin{itemize}")
                    in_list = True
                result.append(line)
            else:
                # A non-empty, non-item line ends the current list; blank
                # lines inside a list are tolerated.
                if in_list and line.strip():
                    result.append("\\end{itemize}")
                    in_list = False
                result.append(line)

        if in_list:
            result.append("\\end{itemize}")

        return "\n".join(result)

    def _create_bibliography(self, content: str) -> str:
        """Extract sources and create a LaTeX thebibliography section."""
        sources_start = find_sources_section(content)
        if sources_start == -1:
            return ""

        sources_content = content[sources_start:]
        pattern = re.compile(
            r"^\[(\d+)\]\s*(.+?)(?:\n\s*URL:\s*(.+?))?$", re.MULTILINE
        )

        bibliography = "\n\\begin{thebibliography}{99}\n"

        for match in pattern.finditer(sources_content):
            citation_num = match.group(1)
            title = match.group(2).strip()
            url = match.group(3).strip() if match.group(3) else ""

            # Escape special LaTeX characters in the title only; the URL
            # goes through \url{} verbatim.
            escaped_title = self._escape_latex(title)

            if url:
                bibliography += f"\\bibitem{{{citation_num}}} {escaped_title}. \\url{{{url}}}\n"
            else:
                bibliography += (
                    f"\\bibitem{{{citation_num}}} {escaped_title}.\n"
                )

        bibliography += "\\end{thebibliography}\n"

        return bibliography