Coverage for src / local_deep_research / text_optimization / citation_formatter.py: 97%
396 statements
« prev ^ index » next coverage.py v7.13.4, created at 2026-02-25 01:07 +0000
« prev ^ index » next coverage.py v7.13.4, created at 2026-02-25 01:07 +0000
1"""Citation formatter for adding hyperlinks and alternative citation styles."""
3import re
4from enum import Enum
5from typing import Dict, Tuple
6from urllib.parse import urlparse
# Regexes that locate a "Sources"-style section: either a markdown heading
# (e.g. "## References") or a bare "Sources:" line on its own.
_SOURCES_SECTION_PATTERNS = [
    re.compile(
        r"^#{1,3}\s*(?:Sources|References|Bibliography|Citations)",
        re.MULTILINE | re.IGNORECASE,
    ),
    re.compile(
        r"^(?:Sources|References|Bibliography|Citations):?\s*$",
        re.MULTILINE | re.IGNORECASE,
    ),
]


def find_sources_section(content: str) -> int:
    """Find the start position of the sources/references section in *content*.

    Returns -1 if no section is found.
    """
    # Patterns are tried in order; the first one that matches anywhere wins.
    candidate_matches = (
        pattern.search(content) for pattern in _SOURCES_SECTION_PATTERNS
    )
    return next(
        (match.start() for match in candidate_matches if match is not None),
        -1,
    )
class CitationMode(Enum):
    """Available citation formatting modes."""

    NUMBER_HYPERLINKS = "number_hyperlinks"  # [1] with hyperlinks
    DOMAIN_HYPERLINKS = "domain_hyperlinks"  # [arxiv.org] with hyperlinks
    DOMAIN_ID_HYPERLINKS = (
        "domain_id_hyperlinks"  # [arxiv.org] or [arxiv.org-1] with smart IDs
    )
    DOMAIN_ID_ALWAYS_HYPERLINKS = (
        "domain_id_always_hyperlinks"  # [arxiv.org-1] always with IDs
    )
    NO_HYPERLINKS = "no_hyperlinks"  # [1] without hyperlinks


class CitationFormatter:
    """Formats citations in markdown documents with various styles."""

    def __init__(self, mode: CitationMode = CitationMode.NUMBER_HYPERLINKS):
        self.mode = mode
        # Negative lookbehind/lookahead avoid re-matching citations that are
        # already formatted (e.g. the inner [1] of [[1]](url)).
        # Unicode lenticular brackets 【】 (U+3010 and U+3011) are matched too
        # because LLMs sometimes generate them.
        self.citation_pattern = re.compile(
            r"(?<![\[【])[\[【](\d+)[\]】](?![\]】])"
        )
        self.comma_citation_pattern = re.compile(
            r"[\[【](\d+(?:,\s*\d+)+)[\]】]"
        )
        # Also match "Source X" or "source X" patterns
        self.source_word_pattern = re.compile(r"\b[Ss]ource\s+(\d+)\b")
        # One sources line: "[1] Title" optionally followed by a "URL: ..." line
        self.sources_pattern = re.compile(
            r"^\[(\d+(?:,\s*\d+)*)\]\s*(.+?)(?:\n\s*URL:\s*(.+?))?$",
            re.MULTILINE,
        )

    def _create_source_word_replacer(self, formatter_func):
        """Create a replacement function for 'Source X' patterns.

        Args:
            formatter_func: A function that takes citation_num and returns
                formatted text

        Returns:
            A replacement function for use with regex sub
        """

        def replace_source_word(match):
            return formatter_func(match.group(1))

        return replace_source_word

    def _create_citation_formatter(self, sources_dict, format_pattern):
        """Create a formatter function for citations.

        Args:
            sources_dict: Dictionary mapping citation numbers to data
            format_pattern: Callable ``(citation_num, data) -> str``

        Returns:
            A function that formats known citations, or returns the plain
            "[n]" fallback for unknown ones
        """

        def formatter(citation_num):
            if citation_num in sources_dict:
                data = sources_dict[citation_num]
                return format_pattern(citation_num, data)
            return f"[{citation_num}]"

        return formatter

    def _replace_comma_citations(self, content, lookup, format_one):
        """Replace comma-separated citations like [1, 2, 3] using *lookup* and *format_one*.

        Args:
            content: Text to process
            lookup: Dict mapping citation number (str) to data
            format_one: ``(num, data) -> str`` callback that formats a single citation
        """

        def _replacer(match):
            nums = [n.strip() for n in match.group(1).split(",")]
            parts = []
            for num in nums:
                if num in lookup:
                    parts.append(format_one(num, lookup[num]))
                else:
                    # Unknown numbers degrade to a plain bracketed citation.
                    parts.append(f"[{num}]")
            return "".join(parts)

        return self.comma_citation_pattern.sub(_replacer, content)

    def _apply_citation_formatting(self, content, lookup, format_one):
        """Run the complete citation-rewriting pipeline over *content*.

        Shared by every hyperlinking mode: handles comma-separated citations
        ([1, 2]), individual citations ([1] / 【1】), and "Source X" mentions.

        Args:
            content: Text to process
            lookup: Dict mapping citation number (str) to data
            format_one: ``(num, data) -> str`` single-citation formatter
        """
        content = self._replace_comma_citations(content, lookup, format_one)
        formatter = self._create_citation_formatter(lookup, format_one)

        def replace_citation(match):
            # Citations with no usable source keep their original text
            # (including any lenticular brackets) rather than being rewritten.
            return (
                formatter(match.group(1))
                if match.group(1) in lookup
                else match.group(0)
            )

        content = self.citation_pattern.sub(replace_citation, content)

        # Also handle "Source X" patterns
        return self.source_word_pattern.sub(
            self._create_source_word_replacer(formatter), content
        )

    def format_document(self, content: str) -> str:
        """Format citations in the document according to the selected mode.

        Args:
            content: The markdown content to format

        Returns:
            Formatted markdown content
        """
        if self.mode == CitationMode.NO_HYPERLINKS:
            return content

        # Without a sources section there is nothing to link citations to.
        sources_start = self._find_sources_section(content)
        if sources_start == -1:
            return content

        document_content = content[:sources_start]
        sources_content = content[sources_start:]

        sources = self._parse_sources(sources_content)

        if self.mode == CitationMode.NUMBER_HYPERLINKS:
            formatted_content = self._format_number_hyperlinks(
                document_content, sources
            )
        elif self.mode == CitationMode.DOMAIN_HYPERLINKS:
            formatted_content = self._format_domain_hyperlinks(
                document_content, sources
            )
        elif self.mode == CitationMode.DOMAIN_ID_HYPERLINKS:
            formatted_content = self._format_domain_id_hyperlinks(
                document_content, sources
            )
        elif self.mode == CitationMode.DOMAIN_ID_ALWAYS_HYPERLINKS:
            formatted_content = self._format_domain_id_always_hyperlinks(
                document_content, sources
            )
        else:
            formatted_content = document_content

        # Rebuild document with the untouched sources section appended.
        return formatted_content + sources_content

    def _find_sources_section(self, content: str) -> int:
        """Find the start of the sources/references section."""
        return find_sources_section(content)

    def _parse_sources(
        self, sources_content: str
    ) -> Dict[str, Tuple[str, str]]:
        """Parse sources section to extract citation numbers, titles, and URLs.

        Returns:
            Dictionary mapping citation number to (title, url) tuple
        """
        sources = {}
        for match in self.sources_pattern.finditer(sources_content):
            citation_nums_str = match.group(1)
            title = match.group(2).strip()
            url = match.group(3).strip() if match.group(3) else ""

            # Handle comma-separated citation numbers like [36, 3]:
            # every listed number shares the same title/URL.
            for num in citation_nums_str.split(","):
                sources[num.strip()] = (title, url)

        return sources

    def _group_citations_by_domain(self, sources):
        """Group citations that have URLs by the display domain of their URL.

        Returns:
            Dict mapping domain to a list of (citation_num, url) tuples,
            preserving the iteration order of *sources*.
        """
        domain_citations = {}
        for citation_num, (_title, url) in sources.items():
            if url:
                domain = self._extract_domain(url)
                domain_citations.setdefault(domain, []).append(
                    (citation_num, url)
                )
        return domain_citations

    def _format_number_hyperlinks(
        self, content: str, sources: Dict[str, Tuple[str, str]]
    ) -> str:
        """Replace [1] with hyperlinked version where only the number is linked."""
        # Only sources with URLs can be hyperlinked.
        url_sources = {
            num: (title, url) for num, (title, url) in sources.items() if url
        }

        def format_number_link(citation_num, data):
            _, url = data
            return f"[[{citation_num}]]({url})"

        return self._apply_citation_formatting(
            content, url_sources, format_number_link
        )

    def _format_domain_hyperlinks(
        self, content: str, sources: Dict[str, Tuple[str, str]]
    ) -> str:
        """Replace [1] with [domain.com] hyperlinked version."""
        # Only sources with URLs can be hyperlinked.
        url_sources = {
            num: (title, url) for num, (title, url) in sources.items() if url
        }

        def format_domain_link(citation_num, data):
            _, url = data
            domain = self._extract_domain(url)
            return f"[[{domain}]]({url})"

        return self._apply_citation_formatting(
            content, url_sources, format_domain_link
        )

    def _format_domain_id_hyperlinks(
        self, content: str, sources: Dict[str, Tuple[str, str]]
    ) -> str:
        """Replace [1] with [domain.com-1] hyperlinked version with hyphen-separated IDs."""
        citation_to_domain_id = {}
        for domain, citations in self._group_citations_by_domain(
            sources
        ).items():
            if len(citations) > 1:
                # Multiple citations from same domain - disambiguate with -N
                for idx, (citation_num, url) in enumerate(citations, 1):
                    citation_to_domain_id[citation_num] = (
                        f"{domain}-{idx}",
                        url,
                    )
            else:
                # Single citation from domain - no ID needed
                citation_num, url = citations[0]
                citation_to_domain_id[citation_num] = (domain, url)

        def format_domain_id_link(citation_num, data):
            domain_id, url = data
            return f"[[{domain_id}]]({url})"

        return self._apply_citation_formatting(
            content, citation_to_domain_id, format_domain_id_link
        )

    def _format_domain_id_always_hyperlinks(
        self, content: str, sources: Dict[str, Tuple[str, str]]
    ) -> str:
        """Replace [1] with [domain.com-1] hyperlinked version, always with IDs."""
        citation_to_domain_id = {}
        for domain, citations in self._group_citations_by_domain(
            sources
        ).items():
            # Always add hyphen and number for consistency
            for idx, (citation_num, url) in enumerate(citations, 1):
                citation_to_domain_id[citation_num] = (f"{domain}-{idx}", url)

        def format_domain_id_link(citation_num, data):
            domain_id, url = data
            return f"[[{domain_id}]]({url})"

        return self._apply_citation_formatting(
            content, citation_to_domain_id, format_domain_id_link
        )

    def _to_superscript(self, text: str) -> str:
        """Convert digits in *text* to Unicode superscript characters."""
        superscript_map = {
            "0": "⁰",
            "1": "¹",
            "2": "²",
            "3": "³",
            "4": "⁴",
            "5": "⁵",
            "6": "⁶",
            "7": "⁷",
            "8": "⁸",
            "9": "⁹",
        }
        return "".join(superscript_map.get(c, c) for c in text)

    def _extract_domain(self, url: str) -> str:
        """Extract a short display domain from *url* ("source" on failure)."""
        try:
            parsed = urlparse(url)
            domain = parsed.netloc
            # Remove www. prefix if present
            if domain.startswith("www."):
                domain = domain[4:]
            # Keep known domains as-is
            known_domains = {
                "arxiv.org": "arxiv.org",
                "github.com": "github.com",
                "reddit.com": "reddit.com",
                "youtube.com": "youtube.com",
                "pypi.org": "pypi.org",
                "milvus.io": "milvus.io",
                "medium.com": "medium.com",
            }

            for known, display in known_domains.items():
                if known in domain:
                    return display

            # For other domains, keep the last two labels (e.g. example.com)
            parts = domain.split(".")
            if len(parts) >= 2:
                return ".".join(parts[-2:])
            return domain
        except (ValueError, AttributeError):
            return "source"
436class QuartoExporter:
437 """Export markdown documents to Quarto (.qmd) format."""
439 def __init__(self):
440 # Also match Unicode lenticular brackets 【】 (U+3010 and U+3011) that LLMs sometimes generate
441 self.citation_pattern = re.compile(
442 r"(?<![\[【])[\[【](\d+)[\]】](?![\]】])"
443 )
444 self.comma_citation_pattern = re.compile(
445 r"[\[【](\d+(?:,\s*\d+)+)[\]】]"
446 )
448 def export_to_quarto(self, content: str, title: str = None) -> str:
449 """
450 Convert markdown document to Quarto format.
452 Args:
453 content: Markdown content
454 title: Document title (if None, will extract from content)
456 Returns:
457 Quarto formatted content
458 """
459 # Extract title from markdown if not provided
460 if not title:
461 title_match = re.search(r"^#\s+(.+)$", content, re.MULTILINE)
462 title = title_match.group(1) if title_match else "Research Report"
464 # Create Quarto YAML header
465 from datetime import datetime, UTC
467 current_date = datetime.now(UTC).strftime("%Y-%m-%d")
468 yaml_header = f"""---
469title: "{title}"
470author: "Local Deep Research"
471date: "{current_date}"
472format:
473 html:
474 toc: true
475 toc-depth: 3
476 number-sections: true
477 pdf:
478 toc: true
479 number-sections: true
480 colorlinks: true
481bibliography: references.bib
482csl: apa.csl
483---
485"""
487 # Process content
488 processed_content = content
490 # First handle comma-separated citations like [1, 2, 3]
491 def replace_comma_citations(match):
492 citation_nums = match.group(1)
493 # Split by comma and strip whitespace
494 nums = [num.strip() for num in citation_nums.split(",")]
495 refs = [f"@ref{num}" for num in nums]
496 return f"[{', '.join(refs)}]"
498 processed_content = self.comma_citation_pattern.sub(
499 replace_comma_citations, processed_content
500 )
502 # Then convert individual citations to Quarto format [@citation]
503 def replace_citation(match):
504 citation_num = match.group(1)
505 return f"[@ref{citation_num}]"
507 processed_content = self.citation_pattern.sub(
508 replace_citation, processed_content
509 )
511 # Generate bibliography file content
512 bib_content = self._generate_bibliography(content)
514 # Add note about bibliography file
515 bibliography_note = (
516 "\n\n::: {.callout-note}\n## Bibliography File Required\n\nThis document requires a `references.bib` file in the same directory with the following content:\n\n```bibtex\n"
517 + bib_content
518 + "\n```\n:::\n"
519 )
521 return yaml_header + processed_content + bibliography_note
523 def _generate_bibliography(self, content: str) -> str:
524 """Generate BibTeX bibliography from sources."""
525 sources_pattern = re.compile(
526 r"^\[(\d+)\]\s*(.+?)(?:\n\s*URL:\s*(.+?))?$", re.MULTILINE
527 )
529 bibliography = ""
530 matches = list(sources_pattern.finditer(content))
532 for match in matches:
533 citation_num = match.group(1)
534 title = match.group(2).strip()
535 url = match.group(3).strip() if match.group(3) else ""
537 # Generate BibTeX entry
538 bib_entry = f"@misc{{ref{citation_num},\n"
539 bib_entry += f' title = "{{{title}}}",\n'
540 if url:
541 bib_entry += f" url = {{{url}}},\n"
542 bib_entry += f' howpublished = "\\url{{{url}}}",\n'
543 bib_entry += f" year = {{{2024}}},\n"
544 bib_entry += ' note = "Accessed: \\today"\n'
545 bib_entry += "}\n"
547 bibliography += bib_entry + "\n"
549 return bibliography.strip()
552class RISExporter:
553 """Export references to RIS format for reference managers like Zotero."""
555 def __init__(self):
556 self.sources_pattern = re.compile(
557 r"^\[(\d+(?:,\s*\d+)*)\]\s*(.+?)(?:\n\s*URL:\s*(.+?))?$",
558 re.MULTILINE,
559 )
561 def export_to_ris(self, content: str) -> str:
562 """
563 Extract references from markdown and convert to RIS format.
565 Args:
566 content: Markdown content with sources
568 Returns:
569 RIS formatted references
570 """
571 # Find sources section
572 sources_start = find_sources_section(content)
573 if sources_start == -1:
574 return ""
576 # Find the end of the first sources section (before any other major section)
577 sources_content = content[sources_start:]
579 # Look for the next major section to avoid duplicates
580 next_section_markers = [
581 "\n## ALL SOURCES",
582 "\n### ALL SOURCES",
583 "\n## Research Metrics",
584 "\n### Research Metrics",
585 "\n## SEARCH QUESTIONS",
586 "\n### SEARCH QUESTIONS",
587 "\n## DETAILED FINDINGS",
588 "\n### DETAILED FINDINGS",
589 "\n---", # Horizontal rule often separates sections
590 ]
592 sources_end = len(sources_content)
593 for marker in next_section_markers:
594 pos = sources_content.find(marker)
595 if pos != -1 and pos < sources_end:
596 sources_end = pos
598 sources_content = sources_content[:sources_end]
600 # Parse sources and generate RIS entries
601 ris_entries = []
602 seen_refs = set() # Track which references we've already processed
604 # Split sources into individual entries
605 import re
607 # Pattern to match each source entry
608 source_entry_pattern = re.compile(
609 r"^\[(\d+)\]\s*(.+?)(?=^\[\d+\]|\Z)", re.MULTILINE | re.DOTALL
610 )
612 for match in source_entry_pattern.finditer(sources_content):
613 citation_num = match.group(1)
614 entry_text = match.group(2).strip()
616 # Extract the title (first line)
617 lines = entry_text.split("\n")
618 title = lines[0].strip()
620 # Extract URL, DOI, and other metadata from subsequent lines
621 url = ""
622 metadata = {}
623 for line in lines[1:]:
624 line = line.strip()
625 if line.startswith("URL:"):
626 url = line[4:].strip()
627 elif line.startswith("DOI:"):
628 metadata["doi"] = line[4:].strip()
629 elif line.startswith("Published in"):
630 metadata["journal"] = line[12:].strip()
631 # Add more metadata parsing as needed
632 elif line: 632 ↛ 623line 632 didn't jump to line 623 because the condition on line 632 was always true
633 # Store other lines as additional metadata
634 if "additional" not in metadata: 634 ↛ 636line 634 didn't jump to line 636 because the condition on line 634 was always true
635 metadata["additional"] = []
636 metadata["additional"].append(line)
638 # Combine title with additional metadata lines for full context
639 full_text = entry_text
641 # Create a unique key to avoid duplicates
642 ref_key = (citation_num, title, url)
643 if ref_key not in seen_refs: 643 ↛ 612line 643 didn't jump to line 612 because the condition on line 643 was always true
644 seen_refs.add(ref_key)
645 # Create RIS entry with full text for metadata extraction
646 ris_entry = self._create_ris_entry(
647 citation_num, full_text, url, metadata
648 )
649 ris_entries.append(ris_entry)
651 return "\n".join(ris_entries)
653 def _create_ris_entry(
654 self, ref_id: str, full_text: str, url: str = "", metadata: dict = None
655 ) -> str:
656 """Create a single RIS entry."""
657 lines = []
659 # Parse metadata from full text
660 import re
662 if metadata is None:
663 metadata = {}
665 # Extract title from first line
666 lines = full_text.split("\n")
667 title = lines[0].strip()
669 # Extract year from full text (looks for 4-digit year)
670 year_match = re.search(r"\b(19\d{2}|20\d{2})\b", full_text)
671 year = year_match.group(1) if year_match else None
673 # Extract authors if present (looks for "by Author1, Author2")
674 authors_match = re.search(
675 r"\bby\s+([^.\n]+?)(?:\.|\n|$)", full_text, re.IGNORECASE
676 )
677 authors = []
678 if authors_match:
679 authors_text = authors_match.group(1)
680 # Split by 'and' or ','
681 author_parts = re.split(r"\s*(?:,|\sand\s|&)\s*", authors_text)
682 authors = [a.strip() for a in author_parts if a.strip()]
684 # Extract DOI from metadata or text
685 doi = metadata.get("doi")
686 if not doi:
687 doi_match = re.search(
688 r"DOI:\s*([^\s\n]+)", full_text, re.IGNORECASE
689 )
690 doi = doi_match.group(1) if doi_match else None
692 # Clean title - remove author and metadata info for cleaner title
693 clean_title = title
694 if authors_match and authors_match.start() < len(title):
695 clean_title = (
696 title[: authors_match.start()] + title[authors_match.end() :]
697 if authors_match.end() < len(title)
698 else title[: authors_match.start()]
699 )
700 clean_title = re.sub(
701 r"\s*DOI:\s*[^\s]+", "", clean_title, flags=re.IGNORECASE
702 )
703 clean_title = re.sub(
704 r"\s*Published in.*", "", clean_title, flags=re.IGNORECASE
705 )
706 clean_title = re.sub(
707 r"\s*Volume.*", "", clean_title, flags=re.IGNORECASE
708 )
709 clean_title = re.sub(
710 r"\s*Pages.*", "", clean_title, flags=re.IGNORECASE
711 )
712 clean_title = clean_title.strip()
714 # TY - Type of reference (ELEC for electronic source/website)
715 lines.append("TY - ELEC")
717 # ID - Reference ID
718 lines.append(f"ID - ref{ref_id}")
720 # TI - Title
721 lines.append(f"TI - {clean_title if clean_title else title}")
723 # AU - Authors
724 for author in authors:
725 lines.append(f"AU - {author}")
727 # DO - DOI
728 if doi:
729 lines.append(f"DO - {doi}")
731 # PY - Publication year (if found in title)
732 if year:
733 lines.append(f"PY - {year}")
735 # UR - URL
736 if url:
737 lines.append(f"UR - {url}")
739 # Try to extract domain as publisher
740 try:
741 from urllib.parse import urlparse
743 parsed = urlparse(url)
744 domain = parsed.netloc
745 if domain.startswith("www."):
746 domain = domain[4:]
747 # Extract readable publisher name from domain
748 if domain == "github.com" or domain.endswith(".github.com"):
749 lines.append("PB - GitHub")
750 elif domain == "arxiv.org" or domain.endswith(".arxiv.org"):
751 lines.append("PB - arXiv")
752 elif domain == "reddit.com" or domain.endswith(".reddit.com"):
753 lines.append("PB - Reddit")
754 elif ( 754 ↛ 759line 754 didn't jump to line 759 because the condition on line 754 was never true
755 domain == "youtube.com"
756 or domain == "m.youtube.com"
757 or domain.endswith(".youtube.com")
758 ):
759 lines.append("PB - YouTube")
760 elif domain == "medium.com" or domain.endswith(".medium.com"): 760 ↛ 761line 760 didn't jump to line 761 because the condition on line 760 was never true
761 lines.append("PB - Medium")
762 elif domain == "pypi.org" or domain.endswith(".pypi.org"): 762 ↛ 763line 762 didn't jump to line 763 because the condition on line 762 was never true
763 lines.append("PB - Python Package Index (PyPI)")
764 else:
765 # Use domain as publisher
766 lines.append(f"PB - {domain}")
767 except (ValueError, AttributeError):
768 pass
770 # Y1 - Year accessed (current year)
771 from datetime import datetime, UTC
773 current_year = datetime.now(UTC).year
774 lines.append(f"Y1 - {current_year}")
776 # DA - Date accessed
777 current_date = datetime.now(UTC).strftime("%Y/%m/%d")
778 lines.append(f"DA - {current_date}")
780 # LA - Language
781 lines.append("LA - en")
783 # ER - End of reference
784 lines.append("ER - ")
786 return "\n".join(lines)
class LaTeXExporter:
    """Export markdown documents to LaTeX format."""

    def __init__(self):
        # Also match Unicode lenticular brackets 【】 (U+3010 and U+3011)
        # that LLMs sometimes generate
        self.citation_pattern = re.compile(r"[\[【](\d+)[\]】]")
        self.heading_patterns = [
            (re.compile(r"^# (.+)$", re.MULTILINE), r"\\section{\1}"),
            (re.compile(r"^## (.+)$", re.MULTILINE), r"\\subsection{\1}"),
            (re.compile(r"^### (.+)$", re.MULTILINE), r"\\subsubsection{\1}"),
        ]
        self.emphasis_patterns = [
            (re.compile(r"\*\*(.+?)\*\*"), r"\\textbf{\1}"),
            (re.compile(r"\*(.+?)\*"), r"\\textit{\1}"),
            (re.compile(r"`(.+?)`"), r"\\texttt{\1}"),
        ]

    def export_to_latex(self, content: str) -> str:
        """
        Convert markdown document to LaTeX format.

        Args:
            content: Markdown content

        Returns:
            LaTeX formatted content
        """
        latex_content = self._create_latex_header()

        body_content = content

        # Escape special LaTeX characters but preserve math mode:
        # split by "$" so odd-indexed parts (inside math) stay untouched.
        parts = body_content.split("$")
        for i in range(len(parts)):
            # Even indices are outside math mode
            if i % 2 == 0:
                # Skip empty neighbours that indicate a "$$" display fence
                if not (
                    i > 0
                    and parts[i - 1] == ""
                    and i < len(parts) - 1
                    and parts[i + 1] == ""
                ):
                    # Preserve certain patterns that will be processed later
                    # like headings (#), emphasis (*), and citations ([n])
                    lines = parts[i].split("\n")
                    for j, line in enumerate(lines):
                        # Don't escape lines that start with # (headings)
                        if not line.strip().startswith("#"):
                            # Emphasis markers and citations are handled by
                            # their own patterns, so only escape &, %, _
                            temp_line = line
                            temp_line = temp_line.replace("&", r"\&")
                            temp_line = temp_line.replace("%", r"\%")
                            temp_line = temp_line.replace("_", r"\_")
                            lines[j] = temp_line
                    parts[i] = "\n".join(lines)
        body_content = "$".join(parts)

        # Convert headings
        for pattern, replacement in self.heading_patterns:
            body_content = pattern.sub(replacement, body_content)

        # Convert emphasis
        for pattern, replacement in self.emphasis_patterns:
            body_content = pattern.sub(replacement, body_content)

        # Convert citations to LaTeX \cite{} format
        body_content = self.citation_pattern.sub(r"\\cite{\1}", body_content)

        # Convert lists
        body_content = self._convert_lists(body_content)

        # Assemble document: header + body + bibliography + footer
        latex_content += body_content
        latex_content += self._create_bibliography(content)
        latex_content += self._create_latex_footer()

        return latex_content

    def _create_latex_header(self) -> str:
        """Create LaTeX document header."""
        return r"""\documentclass[12pt]{article}
\usepackage[utf8]{inputenc}
\usepackage{hyperref}
\usepackage{cite}
\usepackage{url}

\title{Research Report}
\date{\today}

\begin{document}
\maketitle

"""

    def _create_latex_footer(self) -> str:
        """Create LaTeX document footer."""
        return "\n\\end{document}\n"

    def _escape_latex(self, text: str) -> str:
        """Escape special LaTeX characters in text.

        BUG FIX: backslashes were replaced with ``\\textbackslash{}`` first,
        and the later brace passes then mangled that replacement into
        ``\\textbackslash\\{\\}``. A sentinel keeps original backslashes out
        of the way until every other character has been escaped.
        """
        sentinel = "\x00"  # cannot appear in the replacement strings below
        text = text.replace("\\", sentinel)

        replacements = [
            ("&", r"\&"),
            ("%", r"\%"),
            ("$", r"\$"),
            ("#", r"\#"),
            ("_", r"\_"),
            ("{", r"\{"),
            ("}", r"\}"),
            # These come after the brace passes so their braces survive.
            ("~", r"\textasciitilde{}"),
            ("^", r"\textasciicircum{}"),
        ]
        for old, new in replacements:
            text = text.replace(old, new)

        return text.replace(sentinel, r"\textbackslash{}")

    def _convert_lists(self, content: str) -> str:
        """Convert markdown lists to LaTeX format."""
        # Simple conversion for bullet points
        content = re.sub(r"^- (.+)$", r"\\item \1", content, flags=re.MULTILINE)

        # Wrap consecutive \item runs in an itemize environment
        lines = content.split("\n")
        result = []
        in_list = False

        for line in lines:
            if line.strip().startswith("\\item"):
                if not in_list:
                    result.append("\\begin{itemize}")
                    in_list = True
                result.append(line)
            else:
                if in_list and line.strip():
                    result.append("\\end{itemize}")
                    in_list = False
                result.append(line)

        if in_list:
            result.append("\\end{itemize}")

        return "\n".join(result)

    def _create_bibliography(self, content: str) -> str:
        """Extract sources and create LaTeX bibliography."""
        sources_start = find_sources_section(content)
        if sources_start == -1:
            return ""

        sources_content = content[sources_start:]
        pattern = re.compile(
            r"^\[(\d+)\]\s*(.+?)(?:\n\s*URL:\s*(.+?))?$", re.MULTILINE
        )

        bibliography = "\n\\begin{thebibliography}{99}\n"

        for match in pattern.finditer(sources_content):
            citation_num = match.group(1)
            title = match.group(2).strip()
            url = match.group(3).strip() if match.group(3) else ""

            # Escape special LaTeX characters in title
            escaped_title = self._escape_latex(title)

            if url:
                bibliography += f"\\bibitem{{{citation_num}}} {escaped_title}. \\url{{{url}}}\n"
            else:
                bibliography += (
                    f"\\bibitem{{{citation_num}}} {escaped_title}.\n"
                )

        bibliography += "\\end{thebibliography}\n"

        return bibliography