Coverage for src / local_deep_research / text_optimization / citation_formatter.py: 93%
433 statements
« prev ^ index » next coverage.py v7.12.0, created at 2026-01-11 00:51 +0000
« prev ^ index » next coverage.py v7.12.0, created at 2026-01-11 00:51 +0000
1"""Citation formatter for adding hyperlinks and alternative citation styles."""
3import re
4from enum import Enum
5from typing import Dict, Tuple
6from urllib.parse import urlparse
class CitationMode(Enum):
    """Available citation formatting modes."""

    NUMBER_HYPERLINKS = "number_hyperlinks"  # [1] with hyperlinks
    DOMAIN_HYPERLINKS = "domain_hyperlinks"  # [arxiv.org] with hyperlinks
    DOMAIN_ID_HYPERLINKS = (
        "domain_id_hyperlinks"  # [arxiv.org] or [arxiv.org-1] with smart IDs
    )
    DOMAIN_ID_ALWAYS_HYPERLINKS = (
        "domain_id_always_hyperlinks"  # [arxiv.org-1] always with IDs
    )
    NO_HYPERLINKS = "no_hyperlinks"  # [1] without hyperlinks


class CitationFormatter:
    """Formats citations in markdown documents with various styles.

    The formatter finds a trailing "Sources"/"References" section, parses
    the numbered source list (title + optional URL), and rewrites inline
    citations like ``[1]``, ``[1, 2]`` and ``Source 1`` in the body text
    according to the selected :class:`CitationMode`. The sources section
    itself is left untouched.
    """

    def __init__(self, mode: CitationMode = CitationMode.NUMBER_HYPERLINKS):
        self.mode = mode
        # Negative lookbehind/lookahead avoid re-matching citations that are
        # already formatted as [[1]](url).
        self.citation_pattern = re.compile(r"(?<!\[)\[(\d+)\](?!\])")
        # Comma-separated groups like [1, 2, 3] (requires at least two numbers).
        self.comma_citation_pattern = re.compile(r"\[(\d+(?:,\s*\d+)+)\]")
        # Also match "Source X" or "source X" patterns in prose.
        self.source_word_pattern = re.compile(r"\b[Ss]ource\s+(\d+)\b")
        # One source entry: "[N] Title" optionally followed by a "URL: ..." line.
        self.sources_pattern = re.compile(
            r"^\[(\d+(?:,\s*\d+)*)\]\s*(.+?)(?:\n\s*URL:\s*(.+?))?$",
            re.MULTILINE,
        )

    def _create_source_word_replacer(self, formatter_func):
        """Create a replacement function for 'Source X' patterns.

        Args:
            formatter_func: A function that takes citation_num and returns
                formatted text.

        Returns:
            A replacement function for use with regex sub.
        """

        def replace_source_word(match):
            citation_num = match.group(1)
            return formatter_func(citation_num)

        return replace_source_word

    def _create_citation_formatter(self, sources_dict, format_pattern):
        """Create a formatter function for citations.

        Args:
            sources_dict: Dictionary mapping citation numbers to data.
            format_pattern: A callable that takes (citation_num, data) and
                returns a formatted string.

        Returns:
            A function that formats citations or returns the plain "[N]"
            fallback when the number is unknown.
        """

        def formatter(citation_num):
            if citation_num in sources_dict:
                data = sources_dict[citation_num]
                return format_pattern(citation_num, data)
            return f"[{citation_num}]"

        return formatter

    def format_document(self, content: str) -> str:
        """
        Format citations in the document according to the selected mode.

        Args:
            content: The markdown content to format.

        Returns:
            Formatted markdown content. Returned unchanged when the mode is
            NO_HYPERLINKS or no sources section can be found.
        """
        if self.mode == CitationMode.NO_HYPERLINKS:
            return content

        # Extract sources section; only the body before it is rewritten.
        sources_start = self._find_sources_section(content)
        if sources_start == -1:
            return content

        document_content = content[:sources_start]
        sources_content = content[sources_start:]

        # Parse sources
        sources = self._parse_sources(sources_content)

        # Format citations in document
        if self.mode == CitationMode.NUMBER_HYPERLINKS:
            formatted_content = self._format_number_hyperlinks(
                document_content, sources
            )
        elif self.mode == CitationMode.DOMAIN_HYPERLINKS:
            formatted_content = self._format_domain_hyperlinks(
                document_content, sources
            )
        elif self.mode == CitationMode.DOMAIN_ID_HYPERLINKS:
            formatted_content = self._format_domain_id_hyperlinks(
                document_content, sources
            )
        elif self.mode == CitationMode.DOMAIN_ID_ALWAYS_HYPERLINKS:
            formatted_content = self._format_domain_id_always_hyperlinks(
                document_content, sources
            )
        else:
            formatted_content = document_content

        # Rebuild document
        return formatted_content + sources_content

    def _find_sources_section(self, content: str) -> int:
        """Find the start of the sources/references section, or -1."""
        patterns = [
            r"^#{1,3}\s*(?:Sources|References|Bibliography|Citations)",
            r"^(?:Sources|References|Bibliography|Citations):?\s*$",
        ]

        for pattern in patterns:
            match = re.search(pattern, content, re.MULTILINE | re.IGNORECASE)
            if match:
                return match.start()

        return -1

    def _parse_sources(
        self, sources_content: str
    ) -> Dict[str, Tuple[str, str]]:
        """
        Parse sources section to extract citation numbers, titles, and URLs.

        Returns:
            Dictionary mapping citation number to (title, url) tuple. A
            grouped entry like "[36, 3] Title" yields one entry per number.
        """
        sources = {}
        matches = list(self.sources_pattern.finditer(sources_content))

        for match in matches:
            citation_nums_str = match.group(1)
            title = match.group(2).strip()
            url = match.group(3).strip() if match.group(3) else ""

            # Handle comma-separated citation numbers like [36, 3]
            individual_nums = [
                num.strip() for num in citation_nums_str.split(",")
            ]

            # Add an entry for each individual number
            for num in individual_nums:
                sources[num] = (title, url)

        return sources

    def _format_number_hyperlinks(
        self, content: str, sources: Dict[str, Tuple[str, str]]
    ) -> str:
        """Replace [1] with hyperlinked version where only the number is linked."""

        # First handle comma-separated citations like [1, 2, 3]
        def replace_comma_citations(match):
            citation_nums = match.group(1)
            nums = [num.strip() for num in citation_nums.split(",")]
            formatted_citations = []

            for num in nums:
                if num in sources and sources[num][1]:
                    url = sources[num][1]
                    formatted_citations.append(f"[[{num}]]({url})")
                else:
                    formatted_citations.append(f"[{num}]")

            return "".join(formatted_citations)

        content = self.comma_citation_pattern.sub(
            replace_comma_citations, content
        )

        # Only sources with a URL can be hyperlinked.
        url_sources = {
            num: (title, url) for num, (title, url) in sources.items() if url
        }

        def format_number_link(citation_num, data):
            _, url = data
            return f"[[{citation_num}]]({url})"

        formatter = self._create_citation_formatter(
            url_sources, format_number_link
        )

        # Handle individual citations
        def replace_citation(match):
            return (
                formatter(match.group(1))
                if match.group(1) in url_sources
                else match.group(0)
            )

        content = self.citation_pattern.sub(replace_citation, content)

        # Also handle "Source X" patterns
        return self.source_word_pattern.sub(
            self._create_source_word_replacer(formatter), content
        )

    def _format_domain_hyperlinks(
        self, content: str, sources: Dict[str, Tuple[str, str]]
    ) -> str:
        """Replace [1] with [domain.com] hyperlinked version."""

        # First handle comma-separated citations like [1, 2, 3]
        def replace_comma_citations(match):
            citation_nums = match.group(1)
            nums = [num.strip() for num in citation_nums.split(",")]
            formatted_citations = []

            for num in nums:
                if num in sources and sources[num][1]:
                    url = sources[num][1]
                    domain = self._extract_domain(url)
                    formatted_citations.append(f"[[{domain}]]({url})")
                else:
                    formatted_citations.append(f"[{num}]")

            return "".join(formatted_citations)

        content = self.comma_citation_pattern.sub(
            replace_comma_citations, content
        )

        # Only sources with a URL can be hyperlinked.
        url_sources = {
            num: (title, url) for num, (title, url) in sources.items() if url
        }

        def format_domain_link(citation_num, data):
            _, url = data
            domain = self._extract_domain(url)
            return f"[[{domain}]]({url})"

        formatter = self._create_citation_formatter(
            url_sources, format_domain_link
        )

        # Handle individual citations
        def replace_citation(match):
            return (
                formatter(match.group(1))
                if match.group(1) in url_sources
                else match.group(0)
            )

        content = self.citation_pattern.sub(replace_citation, content)

        # Also handle "Source X" patterns
        return self.source_word_pattern.sub(
            self._create_source_word_replacer(formatter), content
        )

    def _format_domain_id_hyperlinks(
        self, content: str, sources: Dict[str, Tuple[str, str]]
    ) -> str:
        """Replace [1] with [domain.com-1] links; the -N suffix is added only
        when several citations share the same domain."""
        # Group citation numbers by the domain of their URL.
        domain_citations = {}
        for citation_num, (title, url) in sources.items():
            if url:
                domain = self._extract_domain(url)
                if domain not in domain_citations:
                    domain_citations[domain] = []
                domain_citations[domain].append((citation_num, url))

        # Map citation number -> (display label, url).
        citation_to_domain_id = {}
        for domain, citations in domain_citations.items():
            if len(citations) > 1:
                # Multiple citations from same domain - add hyphen and number
                for idx, (citation_num, url) in enumerate(citations, 1):
                    citation_to_domain_id[citation_num] = (
                        f"{domain}-{idx}",
                        url,
                    )
            else:
                # Single citation from domain - no ID needed
                citation_num, url = citations[0]
                citation_to_domain_id[citation_num] = (domain, url)

        # First handle comma-separated citations
        def replace_comma_citations(match):
            citation_nums = match.group(1)
            nums = [num.strip() for num in citation_nums.split(",")]
            formatted_citations = []

            for num in nums:
                if num in citation_to_domain_id:
                    domain_id, url = citation_to_domain_id[num]
                    formatted_citations.append(f"[[{domain_id}]]({url})")
                else:
                    formatted_citations.append(f"[{num}]")

            return "".join(formatted_citations)

        content = self.comma_citation_pattern.sub(
            replace_comma_citations, content
        )

        def format_domain_id_link(citation_num, data):
            domain_id, url = data
            return f"[[{domain_id}]]({url})"

        formatter = self._create_citation_formatter(
            citation_to_domain_id, format_domain_id_link
        )

        # Handle individual citations
        def replace_citation(match):
            return (
                formatter(match.group(1))
                if match.group(1) in citation_to_domain_id
                else match.group(0)
            )

        content = self.citation_pattern.sub(replace_citation, content)

        # Also handle "Source X" patterns
        return self.source_word_pattern.sub(
            self._create_source_word_replacer(formatter), content
        )

    def _format_domain_id_always_hyperlinks(
        self, content: str, sources: Dict[str, Tuple[str, str]]
    ) -> str:
        """Replace [1] with [domain.com-1] hyperlinked version, always with IDs."""
        # Group citation numbers by the domain of their URL.
        domain_citations = {}
        for citation_num, (title, url) in sources.items():
            if url:
                domain = self._extract_domain(url)
                if domain not in domain_citations:
                    domain_citations[domain] = []
                domain_citations[domain].append((citation_num, url))

        # Map citation number -> (display label, url); the -N suffix is
        # always added for consistency, even for a lone citation.
        citation_to_domain_id = {}
        for domain, citations in domain_citations.items():
            for idx, (citation_num, url) in enumerate(citations, 1):
                citation_to_domain_id[citation_num] = (f"{domain}-{idx}", url)

        # First handle comma-separated citations
        def replace_comma_citations(match):
            citation_nums = match.group(1)
            nums = [num.strip() for num in citation_nums.split(",")]
            formatted_citations = []

            for num in nums:
                if num in citation_to_domain_id:
                    domain_id, url = citation_to_domain_id[num]
                    formatted_citations.append(f"[[{domain_id}]]({url})")
                else:
                    formatted_citations.append(f"[{num}]")

            return "".join(formatted_citations)

        content = self.comma_citation_pattern.sub(
            replace_comma_citations, content
        )

        def format_domain_id_link(citation_num, data):
            domain_id, url = data
            return f"[[{domain_id}]]({url})"

        formatter = self._create_citation_formatter(
            citation_to_domain_id, format_domain_id_link
        )

        # Handle individual citations
        def replace_citation(match):
            return (
                formatter(match.group(1))
                if match.group(1) in citation_to_domain_id
                else match.group(0)
            )

        content = self.citation_pattern.sub(replace_citation, content)

        # Also handle "Source X" patterns
        return self.source_word_pattern.sub(
            self._create_source_word_replacer(formatter), content
        )

    def _to_superscript(self, text: str) -> str:
        """Convert digits in text to Unicode superscript characters."""
        superscript_map = {
            "0": "⁰",
            "1": "¹",
            "2": "²",
            "3": "³",
            "4": "⁴",
            "5": "⁵",
            "6": "⁶",
            "7": "⁷",
            "8": "⁸",
            "9": "⁹",
        }
        return "".join(superscript_map.get(c, c) for c in text)

    def _extract_domain(self, url: str) -> str:
        """Extract a short display domain from a URL.

        Strips a leading "www.", keeps well-known domains as-is, and
        otherwise reduces to the last two dot-separated labels. Returns
        "source" if the URL cannot be parsed.
        """
        try:
            parsed = urlparse(url)
            domain = parsed.netloc
            # Remove www. prefix if present
            if domain.startswith("www."):
                domain = domain[4:]
            # Keep known domains as-is
            known_domains = {
                "arxiv.org": "arxiv.org",
                "github.com": "github.com",
                "reddit.com": "reddit.com",
                "youtube.com": "youtube.com",
                "pypi.org": "pypi.org",
                "milvus.io": "milvus.io",
                "medium.com": "medium.com",
            }

            for known, display in known_domains.items():
                if known in domain:
                    return display

            # For other domains, extract main domain
            parts = domain.split(".")
            if len(parts) >= 2:
                return ".".join(parts[-2:])
            return domain
        except Exception:
            # Was a bare `except:` — narrowed so KeyboardInterrupt/SystemExit
            # are not swallowed; any parse failure still degrades to "source".
            return "source"
457class QuartoExporter:
458 """Export markdown documents to Quarto (.qmd) format."""
460 def __init__(self):
461 self.citation_pattern = re.compile(r"(?<!\[)\[(\d+)\](?!\])")
462 self.comma_citation_pattern = re.compile(r"\[(\d+(?:,\s*\d+)+)\]")
464 def export_to_quarto(self, content: str, title: str = None) -> str:
465 """
466 Convert markdown document to Quarto format.
468 Args:
469 content: Markdown content
470 title: Document title (if None, will extract from content)
472 Returns:
473 Quarto formatted content
474 """
475 # Extract title from markdown if not provided
476 if not title:
477 title_match = re.search(r"^#\s+(.+)$", content, re.MULTILINE)
478 title = title_match.group(1) if title_match else "Research Report"
480 # Create Quarto YAML header
481 from datetime import datetime, UTC
483 current_date = datetime.now(UTC).strftime("%Y-%m-%d")
484 yaml_header = f"""---
485title: "{title}"
486author: "Local Deep Research"
487date: "{current_date}"
488format:
489 html:
490 toc: true
491 toc-depth: 3
492 number-sections: true
493 pdf:
494 toc: true
495 number-sections: true
496 colorlinks: true
497bibliography: references.bib
498csl: apa.csl
499---
501"""
503 # Process content
504 processed_content = content
506 # First handle comma-separated citations like [1, 2, 3]
507 def replace_comma_citations(match):
508 citation_nums = match.group(1)
509 # Split by comma and strip whitespace
510 nums = [num.strip() for num in citation_nums.split(",")]
511 refs = [f"@ref{num}" for num in nums]
512 return f"[{', '.join(refs)}]"
514 processed_content = self.comma_citation_pattern.sub(
515 replace_comma_citations, processed_content
516 )
518 # Then convert individual citations to Quarto format [@citation]
519 def replace_citation(match):
520 citation_num = match.group(1)
521 return f"[@ref{citation_num}]"
523 processed_content = self.citation_pattern.sub(
524 replace_citation, processed_content
525 )
527 # Generate bibliography file content
528 bib_content = self._generate_bibliography(content)
530 # Add note about bibliography file
531 bibliography_note = (
532 "\n\n::: {.callout-note}\n## Bibliography File Required\n\nThis document requires a `references.bib` file in the same directory with the following content:\n\n```bibtex\n"
533 + bib_content
534 + "\n```\n:::\n"
535 )
537 return yaml_header + processed_content + bibliography_note
539 def _generate_bibliography(self, content: str) -> str:
540 """Generate BibTeX bibliography from sources."""
541 sources_pattern = re.compile(
542 r"^\[(\d+)\]\s*(.+?)(?:\n\s*URL:\s*(.+?))?$", re.MULTILINE
543 )
545 bibliography = ""
546 matches = list(sources_pattern.finditer(content))
548 for match in matches:
549 citation_num = match.group(1)
550 title = match.group(2).strip()
551 url = match.group(3).strip() if match.group(3) else ""
553 # Generate BibTeX entry
554 bib_entry = f"@misc{{ref{citation_num},\n"
555 bib_entry += f' title = "{{{title}}}",\n'
556 if url:
557 bib_entry += f" url = {{{url}}},\n"
558 bib_entry += f' howpublished = "\\url{{{url}}}",\n'
559 bib_entry += f" year = {{{2024}}},\n"
560 bib_entry += ' note = "Accessed: \\today"\n'
561 bib_entry += "}\n"
563 bibliography += bib_entry + "\n"
565 return bibliography.strip()
568class RISExporter:
569 """Export references to RIS format for reference managers like Zotero."""
571 def __init__(self):
572 self.sources_pattern = re.compile(
573 r"^\[(\d+(?:,\s*\d+)*)\]\s*(.+?)(?:\n\s*URL:\s*(.+?))?$",
574 re.MULTILINE,
575 )
577 def export_to_ris(self, content: str) -> str:
578 """
579 Extract references from markdown and convert to RIS format.
581 Args:
582 content: Markdown content with sources
584 Returns:
585 RIS formatted references
586 """
587 # Find sources section
588 sources_start = content.find("## Sources")
589 if sources_start == -1:
590 sources_start = content.find("## References")
591 if sources_start == -1:
592 sources_start = content.find("### Sources")
593 if sources_start == -1:
594 sources_start = content.find("### SOURCES")
596 if sources_start == -1:
597 return ""
599 # Find the end of the first sources section (before any other major section)
600 sources_content = content[sources_start:]
602 # Look for the next major section to avoid duplicates
603 next_section_markers = [
604 "\n## ALL SOURCES",
605 "\n### ALL SOURCES",
606 "\n## Research Metrics",
607 "\n### Research Metrics",
608 "\n## SEARCH QUESTIONS",
609 "\n### SEARCH QUESTIONS",
610 "\n## DETAILED FINDINGS",
611 "\n### DETAILED FINDINGS",
612 "\n---", # Horizontal rule often separates sections
613 ]
615 sources_end = len(sources_content)
616 for marker in next_section_markers:
617 pos = sources_content.find(marker)
618 if pos != -1 and pos < sources_end: 618 ↛ 619line 618 didn't jump to line 619 because the condition on line 618 was never true
619 sources_end = pos
621 sources_content = sources_content[:sources_end]
623 # Parse sources and generate RIS entries
624 ris_entries = []
625 seen_refs = set() # Track which references we've already processed
627 # Split sources into individual entries
628 import re
630 # Pattern to match each source entry
631 source_entry_pattern = re.compile(
632 r"^\[(\d+)\]\s*(.+?)(?=^\[\d+\]|\Z)", re.MULTILINE | re.DOTALL
633 )
635 for match in source_entry_pattern.finditer(sources_content):
636 citation_num = match.group(1)
637 entry_text = match.group(2).strip()
639 # Extract the title (first line)
640 lines = entry_text.split("\n")
641 title = lines[0].strip()
643 # Extract URL, DOI, and other metadata from subsequent lines
644 url = ""
645 metadata = {}
646 for line in lines[1:]:
647 line = line.strip()
648 if line.startswith("URL:"):
649 url = line[4:].strip()
650 elif line.startswith("DOI:"):
651 metadata["doi"] = line[4:].strip()
652 elif line.startswith("Published in"):
653 metadata["journal"] = line[12:].strip()
654 # Add more metadata parsing as needed
655 elif line: 655 ↛ 646line 655 didn't jump to line 646 because the condition on line 655 was always true
656 # Store other lines as additional metadata
657 if "additional" not in metadata: 657 ↛ 659line 657 didn't jump to line 659 because the condition on line 657 was always true
658 metadata["additional"] = []
659 metadata["additional"].append(line)
661 # Combine title with additional metadata lines for full context
662 full_text = entry_text
664 # Create a unique key to avoid duplicates
665 ref_key = (citation_num, title, url)
666 if ref_key not in seen_refs: 666 ↛ 635line 666 didn't jump to line 635 because the condition on line 666 was always true
667 seen_refs.add(ref_key)
668 # Create RIS entry with full text for metadata extraction
669 ris_entry = self._create_ris_entry(
670 citation_num, full_text, url, metadata
671 )
672 ris_entries.append(ris_entry)
674 return "\n".join(ris_entries)
676 def _create_ris_entry(
677 self, ref_id: str, full_text: str, url: str = "", metadata: dict = None
678 ) -> str:
679 """Create a single RIS entry."""
680 lines = []
682 # Parse metadata from full text
683 import re
685 if metadata is None: 685 ↛ 686line 685 didn't jump to line 686 because the condition on line 685 was never true
686 metadata = {}
688 # Extract title from first line
689 lines = full_text.split("\n")
690 title = lines[0].strip()
692 # Extract year from full text (looks for 4-digit year)
693 year_match = re.search(r"\b(19\d{2}|20\d{2})\b", full_text)
694 year = year_match.group(1) if year_match else None
696 # Extract authors if present (looks for "by Author1, Author2")
697 authors_match = re.search(
698 r"\bby\s+([^.\n]+?)(?:\.|\n|$)", full_text, re.IGNORECASE
699 )
700 authors = []
701 if authors_match:
702 authors_text = authors_match.group(1)
703 # Split by 'and' or ','
704 author_parts = re.split(r"\s*(?:,|\sand\s|&)\s*", authors_text)
705 authors = [a.strip() for a in author_parts if a.strip()]
707 # Extract DOI from metadata or text
708 doi = metadata.get("doi")
709 if not doi:
710 doi_match = re.search(
711 r"DOI:\s*([^\s\n]+)", full_text, re.IGNORECASE
712 )
713 doi = doi_match.group(1) if doi_match else None
715 # Clean title - remove author and metadata info for cleaner title
716 clean_title = title
717 if authors_match and authors_match.start() < len(title):
718 clean_title = (
719 title[: authors_match.start()] + title[authors_match.end() :]
720 if authors_match.end() < len(title)
721 else title[: authors_match.start()]
722 )
723 clean_title = re.sub(
724 r"\s*DOI:\s*[^\s]+", "", clean_title, flags=re.IGNORECASE
725 )
726 clean_title = re.sub(
727 r"\s*Published in.*", "", clean_title, flags=re.IGNORECASE
728 )
729 clean_title = re.sub(
730 r"\s*Volume.*", "", clean_title, flags=re.IGNORECASE
731 )
732 clean_title = re.sub(
733 r"\s*Pages.*", "", clean_title, flags=re.IGNORECASE
734 )
735 clean_title = clean_title.strip()
737 # TY - Type of reference (ELEC for electronic source/website)
738 lines.append("TY - ELEC")
740 # ID - Reference ID
741 lines.append(f"ID - ref{ref_id}")
743 # TI - Title
744 lines.append(f"TI - {clean_title if clean_title else title}")
746 # AU - Authors
747 for author in authors:
748 lines.append(f"AU - {author}")
750 # DO - DOI
751 if doi:
752 lines.append(f"DO - {doi}")
754 # PY - Publication year (if found in title)
755 if year:
756 lines.append(f"PY - {year}")
758 # UR - URL
759 if url:
760 lines.append(f"UR - {url}")
762 # Try to extract domain as publisher
763 try:
764 from urllib.parse import urlparse
766 parsed = urlparse(url)
767 domain = parsed.netloc
768 if domain.startswith("www."):
769 domain = domain[4:]
770 # Extract readable publisher name from domain
771 if domain == "github.com" or domain.endswith(".github.com"):
772 lines.append("PB - GitHub")
773 elif domain == "arxiv.org" or domain.endswith(".arxiv.org"):
774 lines.append("PB - arXiv")
775 elif domain == "reddit.com" or domain.endswith(".reddit.com"):
776 lines.append("PB - Reddit")
777 elif ( 777 ↛ 782line 777 didn't jump to line 782 because the condition on line 777 was never true
778 domain == "youtube.com"
779 or domain == "m.youtube.com"
780 or domain.endswith(".youtube.com")
781 ):
782 lines.append("PB - YouTube")
783 elif domain == "medium.com" or domain.endswith(".medium.com"): 783 ↛ 784line 783 didn't jump to line 784 because the condition on line 783 was never true
784 lines.append("PB - Medium")
785 elif domain == "pypi.org" or domain.endswith(".pypi.org"): 785 ↛ 786line 785 didn't jump to line 786 because the condition on line 785 was never true
786 lines.append("PB - Python Package Index (PyPI)")
787 else:
788 # Use domain as publisher
789 lines.append(f"PB - {domain}")
790 except:
791 pass
793 # Y1 - Year accessed (current year)
794 from datetime import datetime, UTC
796 current_year = datetime.now(UTC).year
797 lines.append(f"Y1 - {current_year}")
799 # DA - Date accessed
800 current_date = datetime.now(UTC).strftime("%Y/%m/%d")
801 lines.append(f"DA - {current_date}")
803 # LA - Language
804 lines.append("LA - en")
806 # ER - End of reference
807 lines.append("ER - ")
809 return "\n".join(lines)
class LaTeXExporter:
    """Export markdown documents to LaTeX format."""

    def __init__(self):
        self.citation_pattern = re.compile(r"\[(\d+)\]")
        self.heading_patterns = [
            (re.compile(r"^# (.+)$", re.MULTILINE), r"\\section{\1}"),
            (re.compile(r"^## (.+)$", re.MULTILINE), r"\\subsection{\1}"),
            (re.compile(r"^### (.+)$", re.MULTILINE), r"\\subsubsection{\1}"),
        ]
        self.emphasis_patterns = [
            (re.compile(r"\*\*(.+?)\*\*"), r"\\textbf{\1}"),
            (re.compile(r"\*(.+?)\*"), r"\\textit{\1}"),
            (re.compile(r"`(.+?)`"), r"\\texttt{\1}"),
        ]

    def export_to_latex(self, content: str) -> str:
        """
        Convert markdown document to LaTeX format.

        Args:
            content: Markdown content.

        Returns:
            LaTeX formatted content (full document with header and footer).
        """
        latex_content = self._create_latex_header()

        # Convert markdown to LaTeX
        body_content = content

        # Escape special LaTeX characters but preserve math mode.
        # Split by $ so odd-indexed parts (inside $...$) are left alone.
        parts = body_content.split("$")
        for i in range(len(parts)):
            # Even indices are outside math mode
            if i % 2 == 0:
                # Skip the empty part between the two $ of a $$ display block
                if not (
                    i > 0
                    and parts[i - 1] == ""
                    and i < len(parts) - 1
                    and parts[i + 1] == ""
                ):
                    # Preserve patterns that are processed later:
                    # headings (#), emphasis (*), and citations ([n]).
                    lines = parts[i].split("\n")
                    for j, line in enumerate(lines):
                        # Don't escape lines that start with # (headings)
                        if not line.strip().startswith("#"):
                            temp_line = line
                            # Escape special chars except *, #, [, ]
                            temp_line = temp_line.replace("&", r"\&")
                            temp_line = temp_line.replace("%", r"\%")
                            temp_line = temp_line.replace("_", r"\_")
                            lines[j] = temp_line
                    parts[i] = "\n".join(lines)
        body_content = "$".join(parts)

        # Convert headings
        for pattern, replacement in self.heading_patterns:
            body_content = pattern.sub(replacement, body_content)

        # Convert emphasis
        for pattern, replacement in self.emphasis_patterns:
            body_content = pattern.sub(replacement, body_content)

        # Convert citations to LaTeX \cite{} format
        body_content = self.citation_pattern.sub(r"\\cite{\1}", body_content)

        # Convert lists
        body_content = self._convert_lists(body_content)

        # Add body content
        latex_content += body_content

        # Add bibliography section (parsed from the original content)
        latex_content += self._create_bibliography(content)

        # Add footer
        latex_content += self._create_latex_footer()

        return latex_content

    def _create_latex_header(self) -> str:
        """Create LaTeX document header."""
        return r"""\documentclass[12pt]{article}
\usepackage[utf8]{inputenc}
\usepackage{hyperref}
\usepackage{cite}
\usepackage{url}

\title{Research Report}
\date{\today}

\begin{document}
\maketitle

"""

    def _create_latex_footer(self) -> str:
        """Create LaTeX document footer."""
        return "\n\\end{document}\n"

    def _escape_latex(self, text: str) -> str:
        """Escape special LaTeX characters in text.

        Uses a single character-by-character pass. The previous sequential
        str.replace() version corrupted backslashes: "\\" became
        "\\textbackslash{}" first, and the later "{"/"}" replacements then
        mangled the inserted macro into "\\textbackslash\\{\\}".
        """
        special = {
            "\\": r"\textbackslash{}",
            "&": r"\&",
            "%": r"\%",
            "$": r"\$",
            "#": r"\#",
            "_": r"\_",
            "{": r"\{",
            "}": r"\}",
            "~": r"\textasciitilde{}",
            "^": r"\textasciicircum{}",
        }
        return "".join(special.get(c, c) for c in text)

    def _convert_lists(self, content: str) -> str:
        """Convert markdown bullet lists to LaTeX itemize environments."""
        # Simple conversion for bullet points
        content = re.sub(r"^- (.+)$", r"\\item \1", content, flags=re.MULTILINE)

        # Add itemize environment around consecutive \item lines
        lines = content.split("\n")
        result = []
        in_list = False

        for line in lines:
            if line.strip().startswith("\\item"):
                if not in_list:
                    result.append("\\begin{itemize}")
                    in_list = True
                result.append(line)
            else:
                if in_list and line.strip():
                    result.append("\\end{itemize}")
                    in_list = False
                result.append(line)

        # Close a list that runs to the end of the document
        if in_list:
            result.append("\\end{itemize}")

        return "\n".join(result)

    def _create_bibliography(self, content: str) -> str:
        """Extract sources and create a LaTeX thebibliography section."""
        sources_start = content.find("## Sources")
        if sources_start == -1:
            sources_start = content.find("## References")

        if sources_start == -1:
            return ""

        sources_content = content[sources_start:]
        pattern = re.compile(
            r"^\[(\d+)\]\s*(.+?)(?:\n\s*URL:\s*(.+?))?$", re.MULTILINE
        )

        bibliography = "\n\\begin{thebibliography}{99}\n"

        for match in pattern.finditer(sources_content):
            citation_num = match.group(1)
            title = match.group(2).strip()
            url = match.group(3).strip() if match.group(3) else ""

            # Escape special LaTeX characters in title
            escaped_title = self._escape_latex(title)

            if url:
                bibliography += f"\\bibitem{{{citation_num}}} {escaped_title}. \\url{{{url}}}\n"
            else:
                bibliography += (
                    f"\\bibitem{{{citation_num}}} {escaped_title}.\n"
                )

        bibliography += "\\end{thebibliography}\n"

        return bibliography