Coverage for src/local_deep_research/text_optimization/citation_formatter.py: 96%
486 statements
« prev ^ index » next coverage.py v7.14.1, created at 2026-06-03 23:15 +0000
« prev ^ index » next coverage.py v7.14.1, created at 2026-06-03 23:15 +0000
1"""Citation formatter for adding hyperlinks and alternative citation styles."""
3import re
4from enum import Enum
5from typing import Any, Dict, List, Tuple
6from urllib.parse import urlparse
# Recognised openers of a trailing bibliography, tried in this order:
#   1. a markdown heading (1-3 ``#``) whose text starts with one of the keywords
#   2. a bare keyword alone on its line, optionally followed by a colon
_SOURCES_SECTION_PATTERNS = [
    re.compile(
        r"^#{1,3}\s*(?:Sources|References|Bibliography|Citations)",
        re.MULTILINE | re.IGNORECASE,
    ),
    re.compile(
        r"^(?:Sources|References|Bibliography|Citations):?\s*$",
        re.MULTILINE | re.IGNORECASE,
    ),
]


def find_sources_section(content: str) -> int:
    """Return the offset where the sources/references section of *content* starts.

    The patterns are consulted in list order and the first pattern that
    matches anywhere wins, so a heading-style match takes precedence over a
    bare-keyword line even when the latter appears earlier in the text.

    Returns -1 if no section is found.
    """
    hits = (rx.search(content) for rx in _SOURCES_SECTION_PATTERNS)
    first_hit = next((hit for hit in hits if hit is not None), None)
    return -1 if first_hit is None else first_hit.start()
class CitationMode(Enum):
    """Available citation formatting modes."""

    # [1] with hyperlinks
    NUMBER_HYPERLINKS = "number_hyperlinks"

    # [arxiv.org] with hyperlinks
    DOMAIN_HYPERLINKS = "domain_hyperlinks"

    # [arxiv.org] or [arxiv.org-1] with smart IDs
    DOMAIN_ID_HYPERLINKS = "domain_id_hyperlinks"

    # [arxiv.org-1] always with IDs
    DOMAIN_ID_ALWAYS_HYPERLINKS = "domain_id_always_hyperlinks"

    SOURCE_TAGGED_HYPERLINKS = "source_tagged_hyperlinks"
    """Keep the global citation number and put a short source tag in front
    of it. The tag comes from the URL: known academic sources via
    ``URLClassifier`` (``arxiv-7``, ``pubmed-3``), the domain otherwise
    (``nytimes.com-9``), and ``local-N`` for empty / local URLs. Unlike the
    DOMAIN_ID_* modes the suffix is the original citation number, so labels
    never collide and follow the bibliography order: ``[1]`` arxiv +
    ``[2]`` openai + ``[3]`` arxiv -> ``arxiv-1``, ``openai-2``, ``arxiv-3``."""

    # [1] without hyperlinks
    NO_HYPERLINKS = "no_hyperlinks"
class CitationFormatter:
    """Formats citations in markdown documents with various styles.

    Inline references (``[1]``, ``【1】``, ``[1, 2]`` and the word form
    ``Source 1``) in a document body are rewritten according to the
    configured :class:`CitationMode`, using a ``{number: (title, url)}``
    mapping that is either parsed from the trailing Sources section
    (:meth:`format_document_split`) or built from a structured source list
    (:meth:`apply_inline_hyperlinks`).
    """

    # Hosts that are always displayed under this exact name, including for
    # their subdomains (``gist.github.com`` -> ``github.com``).
    _KNOWN_DOMAINS = (
        "arxiv.org",
        "github.com",
        "reddit.com",
        "youtube.com",
        "pypi.org",
        "milvus.io",
        "medium.com",
    )

    # Sources section may carry a "Collection: <name>" line for RAG /
    # library hits (emitted by ``utilities/search_utilities.format_links_to_markdown``).
    # The line sits between this ``[N]`` entry's ``URL:`` line and the
    # next ``[N+1]`` entry. We anchor the match on a non-greedy span up
    # to the next citation header (or end of string) to scope correctly.
    _collection_line_pattern = re.compile(
        r"^\[(\d+(?:,\s*\d+)*)\][^\n]*\n"  # the [N] header line
        r"(?:[^\n\[]*\n)*?"  # any non-[ lines (typically URL: ...)
        r"\s*Collection:\s*(.+?)\s*$",
        re.MULTILINE,
    )

    def __init__(self, mode: CitationMode = CitationMode.NUMBER_HYPERLINKS):
        """Store *mode* and compile the patterns used to locate citations."""
        self.mode = mode
        # Use negative lookbehind and lookahead to avoid matching already formatted citations
        # Also match Unicode lenticular brackets 【】 (U+3010 and U+3011) that LLMs sometimes generate
        self.citation_pattern = re.compile(
            r"(?<![\[【])[\[【](\d+)[\]】](?![\]】])"
        )
        self.comma_citation_pattern = re.compile(
            r"[\[【](\d+(?:,\s*\d+)+)[\]】]"
        )
        # Also match "Source X" or "source X" patterns
        self.source_word_pattern = re.compile(r"\b[Ss]ource\s+(\d+)\b")
        self.sources_pattern = re.compile(
            r"^\[(\d+(?:,\s*\d+)*)\]\s*(.+?)(?:\n\s*URL:\s*(.+?))?$",
            re.MULTILINE,
        )

    # ------------------------------------------------------------------
    # Replacement building blocks
    # ------------------------------------------------------------------

    def _create_source_word_replacer(self, formatter_func):
        """Create a replacement function for 'Source X' patterns.

        Args:
            formatter_func: A function that takes citation_num and returns formatted text

        Returns:
            A replacement function for use with regex sub
        """

        def replace_source_word(match):
            return formatter_func(match.group(1))

        return replace_source_word

    def _create_citation_formatter(self, sources_dict, format_pattern):
        """Create a formatter function for citations.

        Args:
            sources_dict: Dictionary mapping citation numbers to data
            format_pattern: A callable that takes (citation_num, data) and returns formatted string

        Returns:
            A function that formats a known citation number, or returns the
            plain ``[N]`` fallback for an unknown one
        """

        def formatter(citation_num):
            if citation_num in sources_dict:
                return format_pattern(citation_num, sources_dict[citation_num])
            return f"[{citation_num}]"

        return formatter

    def _replace_comma_citations(self, content, lookup, format_one):
        """Replace comma-separated citations like [1, 2, 3] using *lookup* and *format_one*.

        Each number is rendered on its own and the pieces are concatenated;
        numbers missing from *lookup* are emitted as plain ``[N]``.

        Args:
            content: Text to process
            lookup: Dict mapping citation number (str) to data
            format_one: ``(num, data) -> str`` callback that formats a single citation
        """

        def _replacer(match):
            nums = (n.strip() for n in match.group(1).split(","))
            return "".join(
                format_one(num, lookup[num]) if num in lookup else f"[{num}]"
                for num in nums
            )

        return self.comma_citation_pattern.sub(_replacer, content)

    def _rewrite_inline_citations(self, content, lookup, format_one):
        """Run the full inline rewrite shared by every hyperlink mode.

        Order matters: comma lists are expanded first, then single bracketed
        numbers (the lookbehind/lookahead in ``citation_pattern`` keeps the
        freshly written ``[[...]]`` labels from being matched again), and
        finally the ``Source N`` word form.

        Args:
            content: Document body to rewrite
            lookup: Dict mapping citation number (str) to data
            format_one: ``(num, data) -> str`` callback that formats a single citation
        """
        content = self._replace_comma_citations(content, lookup, format_one)
        formatter = self._create_citation_formatter(lookup, format_one)

        def replace_citation(match):
            # Unknown numbers keep their original text (bracket style included).
            if match.group(1) in lookup:
                return formatter(match.group(1))
            return match.group(0)

        content = self.citation_pattern.sub(replace_citation, content)
        # "Source N" with an unknown N falls back to the plain "[N]" form.
        return self.source_word_pattern.sub(
            self._create_source_word_replacer(formatter), content
        )

    @staticmethod
    def _with_urls(
        sources: Dict[str, Tuple[str, str]],
    ) -> Dict[str, Tuple[str, str]]:
        """Return the entries of *sources* that have a non-empty URL."""
        return {
            num: (title, url) for num, (title, url) in sources.items() if url
        }

    def _simple_mode_formatters(self):
        """Map each mode that needs only ``(content, sources)`` to its method.

        SOURCE_TAGGED_HYPERLINKS is absent because it takes an extra
        collections argument; NO_HYPERLINKS is absent because it is handled
        by an early return in the callers.
        """
        return {
            CitationMode.NUMBER_HYPERLINKS: self._format_number_hyperlinks,
            CitationMode.DOMAIN_HYPERLINKS: self._format_domain_hyperlinks,
            CitationMode.DOMAIN_ID_HYPERLINKS: self._format_domain_id_hyperlinks,
            CitationMode.DOMAIN_ID_ALWAYS_HYPERLINKS: (
                self._format_domain_id_always_hyperlinks
            ),
        }

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------

    def format_document(self, content: str) -> str:
        """Format citations and return the concatenated answer + sources blob.

        Kept for backward compatibility — most call sites only need the
        concatenated string. New code that needs to persist answer-only
        should use :meth:`format_document_split` instead so the boundary
        is returned explicitly (no re-parsing of the concatenated output).
        """
        formatted_answer, sources_md = self.format_document_split(content)
        return formatted_answer + sources_md

    def format_document_split(self, content: str) -> Tuple[str, str]:
        """Format citations and return (answer, sources_md) separately.

        The boundary between the LLM's answer and the trailing Sources
        section is computed inside this method. Callers that only want
        the answer (e.g. the chat-mode save site) get a clean split
        without re-applying a regex on concatenated output downstream.

        Returns ``(content, "")`` when the formatter is in NO_HYPERLINKS
        mode or when no Sources section can be found in ``content``.
        """
        if self.mode == CitationMode.NO_HYPERLINKS:
            return content, ""

        sources_start = self._find_sources_section(content)
        if sources_start == -1:
            return content, ""

        document_content = content[:sources_start]
        sources_content = content[sources_start:]
        sources = self._parse_sources(sources_content)

        if self.mode == CitationMode.SOURCE_TAGGED_HYPERLINKS:
            formatted_content = self._format_source_tagged_hyperlinks(
                document_content,
                sources,
                self._parse_collections(sources_content),
            )
        else:
            format_body = self._simple_mode_formatters().get(self.mode)
            # A mode without a dedicated formatter leaves the body untouched.
            formatted_content = (
                format_body(document_content, sources)
                if format_body is not None
                else document_content
            )

        return formatted_content, sources_content

    @staticmethod
    def _source_url(source: Dict[str, Any]) -> str:
        """Return the destination of a structured source dict.

        Search-engine result dicts use either "url" or "link" for the
        destination (Searxng emits ``{"link": ..., "title": ...,
        "snippet": ...}``; other engines use "url"). Accepting both keeps
        the hyperlink fallback working regardless of engine.
        """
        return source.get("url") or source.get("link") or ""

    def apply_inline_hyperlinks(
        self, content: str, sources: List[Dict[str, Any]]
    ) -> str:
        """Hyperlink ``[N]`` refs using a structured source list.

        Dispatches on ``self.mode`` so the user's chosen citation
        format (Settings → Report → Citation Format) is honored on
        the fallback path the same way it is in
        :meth:`format_document_split`. Inherits all the existing
        per-mode guards (lookbehind/lookahead against ``[[1]]``,
        comma-list handling like ``[1,2,3]``, ``Source N`` word form,
        missing-index pass-through, lenticular bracket support).

        Used as the safe fallback at save time when the LLM does NOT
        emit a Sources section in its prose — the structured source
        list (e.g. ``search_system.all_links_of_system``) is the
        canonical source of URLs and indices.

        Args:
            content: Answer text containing inline citations.
            sources: Source dicts; entries are used only when they carry
                a non-``None`` ``"index"`` and a non-empty ``"url"`` or
                ``"link"``.

        Returns:
            The rewritten text; ``content or ""`` when either argument is
            empty, and ``content`` unchanged in NO_HYPERLINKS mode or when
            no source is usable.
        """
        if not content or not sources:
            return content or ""
        if self.mode == CitationMode.NO_HYPERLINKS:
            return content

        adapted: Dict[str, Tuple[str, str]] = {}
        for source in sources:
            url = self._source_url(source)
            if url and source.get("index") is not None:
                adapted[str(source["index"])] = (
                    source.get("title", "Untitled"),
                    url,
                )
        if not adapted:
            return content

        if self.mode == CitationMode.SOURCE_TAGGED_HYPERLINKS:
            # Pull collection names off the structured source dicts
            # (format_links_to_markdown uses the same shape:
            # link["metadata"]["collection_name"]) so the SOURCE_TAGGED
            # formatter can surface library/RAG tags as the citation
            # label when present.
            collections: Dict[str, str] = {}
            for source in sources:
                idx = source.get("index")
                if idx is None:
                    continue
                coll = (source.get("metadata") or {}).get("collection_name")
                if coll:
                    # First collection seen for an index wins.
                    collections.setdefault(str(idx), str(coll))
            return self._format_source_tagged_hyperlinks(
                content, adapted, collections
            )

        # NUMBER_HYPERLINKS is the default and the catch-all for any
        # mode added later that doesn't have an explicit formatter.
        format_body = self._simple_mode_formatters().get(
            self.mode, self._format_number_hyperlinks
        )
        return format_body(content, adapted)

    # ------------------------------------------------------------------
    # Sources-section parsing
    # ------------------------------------------------------------------

    def _find_sources_section(self, content: str) -> int:
        """Find the start of the sources/references section."""
        return find_sources_section(content)

    def _parse_sources(
        self, sources_content: str
    ) -> Dict[str, Tuple[str, str]]:
        """
        Parse sources section to extract citation numbers, titles, and URLs.

        A grouped header such as ``[36, 3]`` yields one entry per number,
        all sharing the same title and URL. The URL is ``""`` when the
        entry has no ``URL:`` line.

        Returns:
            Dictionary mapping citation number to (title, url) tuple
        """
        sources: Dict[str, Tuple[str, str]] = {}
        for match in self.sources_pattern.finditer(sources_content):
            title = match.group(2).strip()
            url = match.group(3).strip() if match.group(3) else ""
            for num in match.group(1).split(","):
                sources[num.strip()] = (title, url)
        return sources

    def _parse_collections(self, sources_content: str) -> Dict[str, str]:
        """Extract ``{citation_num: collection_name}`` from a sources
        block. Returns an empty dict when no ``Collection:`` lines exist
        — the absence of collection info is the common case (web URLs)
        and must never raise."""
        collections: Dict[str, str] = {}
        for match in self._collection_line_pattern.finditer(sources_content):
            collection = match.group(2).strip()
            if not collection:
                continue
            for num in match.group(1).split(","):
                collections[num.strip()] = collection
        return collections

    # ------------------------------------------------------------------
    # Per-mode formatters
    # ------------------------------------------------------------------

    def _format_number_hyperlinks(
        self, content: str, sources: Dict[str, Tuple[str, str]]
    ) -> str:
        """Replace [1] with hyperlinked version where only the number is linked."""

        def format_number_link(citation_num, data):
            _, url = data
            return f"[[{citation_num}]]({url})"

        return self._rewrite_inline_citations(
            content, self._with_urls(sources), format_number_link
        )

    def _format_domain_hyperlinks(
        self, content: str, sources: Dict[str, Tuple[str, str]]
    ) -> str:
        """Replace [1] with [domain.com] hyperlinked version."""

        def format_domain_link(citation_num, data):
            _, url = data
            return f"[[{self._extract_domain(url)}]]({url})"

        return self._rewrite_inline_citations(
            content, self._with_urls(sources), format_domain_link
        )

    def _build_domain_id_lookup(
        self, sources: Dict[str, Tuple[str, str]], always_number: bool
    ) -> Dict[str, Tuple[str, str]]:
        """Map each citation number to ``(label, url)`` for the DOMAIN_ID modes.

        Citations are grouped by display domain in the iteration order of
        *sources* and numbered from 1 within each group. With
        ``always_number=False`` a domain cited only once gets the bare
        domain as its label. Sources without a URL are left out.
        """
        by_domain: Dict[str, List[Tuple[str, str]]] = {}
        for citation_num, (_title, url) in sources.items():
            if url:
                by_domain.setdefault(self._extract_domain(url), []).append(
                    (citation_num, url)
                )

        lookup: Dict[str, Tuple[str, str]] = {}
        for domain, citations in by_domain.items():
            numbered = always_number or len(citations) > 1
            for position, (citation_num, url) in enumerate(citations, 1):
                label = f"{domain}-{position}" if numbered else domain
                lookup[citation_num] = (label, url)
        return lookup

    @staticmethod
    def _format_domain_id_link(citation_num, data):
        """Render one ``(label, url)`` pair as ``[[label]](url)``."""
        domain_id, url = data
        return f"[[{domain_id}]]({url})"

    def _format_domain_id_hyperlinks(
        self, content: str, sources: Dict[str, Tuple[str, str]]
    ) -> str:
        """Replace [1] with [domain.com-1] hyperlinked version with hyphen-separated IDs.

        The ``-N`` suffix is added only when a domain is cited more than once.
        """
        return self._rewrite_inline_citations(
            content,
            self._build_domain_id_lookup(sources, always_number=False),
            self._format_domain_id_link,
        )

    def _format_domain_id_always_hyperlinks(
        self, content: str, sources: Dict[str, Tuple[str, str]]
    ) -> str:
        """Replace [1] with [domain.com-1] hyperlinked version, always with IDs."""
        return self._rewrite_inline_citations(
            content,
            self._build_domain_id_lookup(sources, always_number=True),
            self._format_domain_id_link,
        )

    def _format_source_tagged_hyperlinks(
        self,
        content: str,
        sources: Dict[str, Tuple[str, str]],
        collections: Dict[str, str],
    ) -> str:
        """Replace ``[N]`` with ``[[source-N]](url)``.

        ``source`` resolves to (in order): the RAG ``Collection:``
        tag for library hits, the short URLClassifier tag for known
        academic sources (``arxiv``, ``pubmed``, ...), the cleaned
        domain otherwise, or ``local`` for empty/file URLs. ``N`` is
        the original global citation number — labels never collide and
        the suffix always matches the bibliography ordering.

        Args:
            content: Document body (sources section already split off).
            sources: ``{citation_num: (title, url)}`` parsed from the
                sources block.
            collections: ``{citation_num: collection_name}`` parsed from
                optional ``Collection:`` lines in the sources block
                (empty dict when no library/RAG hits are cited). Wins
                over URL-derived tags when present for a given citation.
        """

        def format_link(citation_num, data):
            _, url = data
            label = self._extract_source_label(
                url, collection=collections.get(citation_num)
            )
            tag = f"{label}-{citation_num}"
            # Only emit a hyperlink for http(s) URLs — local/file URLs are
            # rendered as plain bracketed tags so the markdown stays clean
            # and viewers don't try to navigate to a server-local path.
            if self._is_linkable_url(url):
                return f"[[{tag}]]({url})"
            return f"[{tag}]"

        # Unlike the other modes, sources without a URL stay in the lookup
        # so they can still be tagged.
        return self._rewrite_inline_citations(content, sources, format_link)

    # ------------------------------------------------------------------
    # Label helpers
    # ------------------------------------------------------------------

    @staticmethod
    def _slugify_collection(name: str) -> str:
        """Make a user-set collection name safe for inline citations.

        Collection names are free-form strings (``"My Papers"``,
        ``"team/finance"``). Citations need a compact token that won't
        break markdown — strip whitespace, lowercase, replace runs of
        non-alphanumeric chars with a single hyphen, trim leading and
        trailing hyphens, and fall back to ``"local"`` if the result is
        empty. ``-N`` is appended downstream so we strip trailing
        hyphens to keep the join clean.
        """
        slug = re.sub(r"[^a-z0-9]+", "-", name.strip().lower()).strip("-")
        return slug or "local"

    @staticmethod
    def _is_linkable_url(url: str) -> bool:
        """Return True iff ``url`` is a http(s) URL safe to wrap in a
        markdown hyperlink. Empty strings and file:// / local: schemes
        are not linkable."""
        if not url:
            return False
        try:
            scheme = (urlparse(url).scheme or "").lower()
        except (ValueError, AttributeError):
            return False
        return scheme in ("http", "https")

    def _extract_source_label(
        self, url: str, collection: str | None = None
    ) -> str:
        """Return a short source tag for ``url``.

        Resolution order:
          1. ``collection`` (when supplied) wins outright — RAG / library
             hits surface their collection name as the citation tag
             (``mypapers``, ``personal-notes``, ...). The renderer in
             ``utilities/search_utilities.format_links_to_markdown``
             emits a ``Collection:`` line per source for library results,
             which the formatter parses back into this argument.
          2. Empty URL or non-http(s) scheme (``file://``, ``local:``, ...) →
             ``"local"``. Uniform fallback when no collection name is
             available.
          3. ``URLClassifier`` matches a known academic source → use the
             enum value (``arxiv``, ``pubmed``, ``pmc``, ``biorxiv``,
             ``medrxiv``, ``semantic_scholar``, ``doi``).
          4. Otherwise → fall back to ``_extract_domain`` (e.g.
             ``arxiv.org``, ``nytimes.com``).
        """
        if collection:
            return self._slugify_collection(collection)
        if not url:
            return "local"
        try:
            parsed = urlparse(url)
        except (ValueError, AttributeError):
            return "local"
        if (parsed.scheme or "").lower() not in ("http", "https"):
            return "local"

        # Lazy import to keep the formatter usable when the content_fetcher
        # package isn't importable (e.g. minimal test setups).
        try:
            from ..content_fetcher.url_classifier import URLClassifier, URLType
        except ImportError:
            return self._extract_domain(url)

        url_type = URLClassifier.classify(url)
        # Generic HTML/PDF/INVALID → fall back to domain. Everything else
        # is a known academic source whose enum value is the short tag.
        if url_type in (URLType.HTML, URLType.PDF, URLType.INVALID):
            return self._extract_domain(url)
        return url_type.value

    def _extract_domain(self, url: str) -> str:
        """Extract a short display domain from *url*.

        The host is taken without port or credentials, a leading ``www.``
        is dropped, and the result is reduced to its last two labels
        (``blog.example.com`` -> ``example.com``). Hosts listed in
        ``_KNOWN_DOMAINS`` are recognised only as an exact match or as a
        parent of the host on a label boundary, so look-alikes such as
        ``notgithub.com`` or ``github.com.evil.io`` are not reported as
        ``github.com``.

        Returns ``"source"`` when the URL cannot be parsed and ``""`` when
        it has no host component.
        """
        try:
            # ``hostname`` strips ``user:pass@`` and ``:port`` and lower-cases.
            host = urlparse(url).hostname or ""
        except (ValueError, AttributeError):
            return "source"

        if host.startswith("www."):
            host = host[4:]

        for known in self._KNOWN_DOMAINS:
            if host == known or host.endswith("." + known):
                return known

        # NOTE: multi-part public suffixes (e.g. ``bbc.co.uk``) are reduced
        # to their last two labels as well; no public-suffix list is used.
        labels = host.split(".")
        if len(labels) >= 2:
            return ".".join(labels[-2:])
        return host
681class QuartoExporter:
682 """Export markdown documents to Quarto (.qmd) format."""
684 def __init__(self):
685 # Also match Unicode lenticular brackets 【】 (U+3010 and U+3011) that LLMs sometimes generate
686 self.citation_pattern = re.compile(
687 r"(?<![\[【])[\[【](\d+)[\]】](?![\]】])"
688 )
689 self.comma_citation_pattern = re.compile(
690 r"[\[【](\d+(?:,\s*\d+)+)[\]】]"
691 )
693 def export_to_quarto(self, content: str, title: str | None = None) -> str:
694 """
695 Convert markdown document to Quarto format.
697 Args:
698 content: Markdown content
699 title: Document title (if None, will extract from content)
701 Returns:
702 Quarto formatted content
703 """
704 # Extract title from markdown if not provided
705 if not title:
706 title_match = re.search(r"^#\s+(.+)$", content, re.MULTILINE)
707 title = title_match.group(1) if title_match else "Research Report"
709 # Create Quarto YAML header
710 from datetime import datetime, UTC
712 current_date = datetime.now(UTC).strftime("%Y-%m-%d")
713 yaml_header = f"""---
714title: "{title}"
715author: "Local Deep Research"
716date: "{current_date}"
717format:
718 html:
719 toc: true
720 toc-depth: 3
721 number-sections: true
722 pdf:
723 toc: true
724 number-sections: true
725 colorlinks: true
726bibliography: references.bib
727csl: apa.csl
728---
730"""
732 # Process content
733 processed_content = content
735 # First handle comma-separated citations like [1, 2, 3]
736 def replace_comma_citations(match):
737 citation_nums = match.group(1)
738 # Split by comma and strip whitespace
739 nums = [num.strip() for num in citation_nums.split(",")]
740 refs = [f"@ref{num}" for num in nums]
741 return f"[{', '.join(refs)}]"
743 processed_content = self.comma_citation_pattern.sub(
744 replace_comma_citations, processed_content
745 )
747 # Then convert individual citations to Quarto format [@citation]
748 def replace_citation(match):
749 citation_num = match.group(1)
750 return f"[@ref{citation_num}]"
752 processed_content = self.citation_pattern.sub(
753 replace_citation, processed_content
754 )
756 # Generate bibliography file content
757 bib_content = self._generate_bibliography(content)
759 # Add note about bibliography file
760 bibliography_note = (
761 "\n\n::: {.callout-note}\n## Bibliography File Required\n\nThis document requires a `references.bib` file in the same directory with the following content:\n\n```bibtex\n"
762 + bib_content
763 + "\n```\n:::\n"
764 )
766 return yaml_header + processed_content + bibliography_note
768 def _generate_bibliography(self, content: str) -> str:
769 """Generate BibTeX bibliography from sources."""
770 sources_pattern = re.compile(
771 r"^\[(\d+)\]\s*(.+?)(?:\n\s*URL:\s*(.+?))?$", re.MULTILINE
772 )
774 bibliography = ""
775 matches = list(sources_pattern.finditer(content))
777 for match in matches:
778 citation_num = match.group(1)
779 title = match.group(2).strip()
780 url = match.group(3).strip() if match.group(3) else ""
782 # Generate BibTeX entry
783 bib_entry = f"@misc{{ref{citation_num},\n"
784 bib_entry += f' title = "{{{title}}}",\n'
785 if url:
786 bib_entry += f" url = {{{url}}},\n"
787 bib_entry += f' howpublished = "\\url{{{url}}}",\n'
788 bib_entry += f" year = {{{2024}}},\n"
789 bib_entry += ' note = "Accessed: \\today"\n'
790 bib_entry += "}\n"
792 bibliography += bib_entry + "\n"
794 return bibliography.strip()
797class RISExporter:
798 """Export references to RIS format for reference managers like Zotero."""
800 def __init__(self):
801 self.sources_pattern = re.compile(
802 r"^\[(\d+(?:,\s*\d+)*)\]\s*(.+?)(?:\n\s*URL:\s*(.+?))?$",
803 re.MULTILINE,
804 )
def export_to_ris(self, content: str) -> str:
    """
    Extract references from markdown and convert to RIS format.

    Only the first sources section is used: it starts where
    ``find_sources_section`` reports and stops at the earliest follow-up
    section marker (or the end of the text). Each ``[N]`` / ``【N】`` entry
    in that span is handed to ``_create_ris_entry`` once.

    Args:
        content: Markdown content with sources

    Returns:
        RIS formatted references, or ``""`` when no sources section exists
    """
    section_start = find_sources_section(content)
    if section_start == -1:
        return ""
    section = content[section_start:]

    # Headings (and the horizontal rule) that end the first sources list;
    # cutting there keeps later, repeated source lists out of the export.
    stop_markers = (
        "\n## ALL SOURCES",
        "\n### ALL SOURCES",
        "\n## Research Metrics",
        "\n### Research Metrics",
        "\n## SEARCH QUESTIONS",
        "\n### SEARCH QUESTIONS",
        "\n## DETAILED FINDINGS",
        "\n### DETAILED FINDINGS",
        "\n---",  # Horizontal rule often separates sections
    )
    marker_positions = (section.find(marker) for marker in stop_markers)
    section_end = min(
        (pos for pos in marker_positions if pos != -1), default=len(section)
    )
    section = section[:section_end]

    # One match per entry: the number, then everything up to the next
    # entry header or the end of the section. Both ASCII "[N]" and
    # lenticular "【N】" brackets are accepted.
    entry_pattern = re.compile(
        r"^[\[【](\d+)[\]】]\s*(.+?)(?=^[\[【]\d+[\]】]|\Z)",
        re.MULTILINE | re.DOTALL,
    )

    rendered = []
    already_seen = set()  # (number, title, url) keys emitted so far

    for entry in entry_pattern.finditer(section):
        ref_number = entry.group(1)
        entry_body = entry.group(2).strip()

        # First line is the title; the rest are "Key: value" detail lines.
        head, *detail_lines = entry_body.split("\n")
        title = head.strip()

        url = ""
        metadata = {}
        for raw_line in detail_lines:
            text = raw_line.strip()
            if text.startswith("URL:"):
                url = text[4:].strip()
            elif text.startswith("DOI:"):
                metadata["doi"] = text[4:].strip()
            elif text.startswith("Published in"):
                # 12 == len("Published in")
                metadata["journal"] = text[12:].strip()
            elif text:
                # Anything unrecognised is kept verbatim.
                metadata.setdefault("additional", []).append(text)

        dedupe_key = (ref_number, title, url)
        if dedupe_key in already_seen:
            continue
        already_seen.add(dedupe_key)

        # The whole entry text is passed on so the entry builder can pull
        # further metadata out of it.
        rendered.append(
            self._create_ris_entry(ref_number, entry_body, url, metadata)
        )

    return "\n".join(rendered)
905 def _create_ris_entry(
906 self,
907 ref_id: str,
908 full_text: str,
909 url: str = "",
910 metadata: dict | None = None,
911 ) -> str:
912 """Create a single RIS entry."""
913 lines = []
915 # Parse metadata from full text
916 import re
918 if metadata is None:
919 metadata = {}
921 # Extract title from first line
922 lines = full_text.split("\n")
923 title = lines[0].strip()
925 # Extract year from full text (looks for 4-digit year)
926 year_match = re.search(r"\b(19\d{2}|20\d{2})\b", full_text)
927 year = year_match.group(1) if year_match else None
929 # Extract authors if present (looks for "by Author1, Author2")
930 authors_match = re.search(
931 r"\bby\s+([^.\n]+?)(?:\.|\n|$)", full_text, re.IGNORECASE
932 )
933 authors = []
934 if authors_match:
935 authors_text = authors_match.group(1)
936 # Split by 'and' or ','
937 author_parts = re.split(r"\s*(?:,|\sand\s|&)\s*", authors_text)
938 authors = [a.strip() for a in author_parts if a.strip()]
940 # Extract DOI from metadata or text
941 doi = metadata.get("doi")
942 if not doi:
943 doi_match = re.search(
944 r"DOI:\s*([^\s\n]+)", full_text, re.IGNORECASE
945 )
946 doi = doi_match.group(1) if doi_match else None
948 # Clean title - remove author and metadata info for cleaner title
949 clean_title = title
950 if authors_match and authors_match.start() < len(title):
951 clean_title = (
952 title[: authors_match.start()] + title[authors_match.end() :]
953 if authors_match.end() < len(title)
954 else title[: authors_match.start()]
955 )
956 clean_title = re.sub(
957 r"\s*DOI:\s*[^\s]+", "", clean_title, flags=re.IGNORECASE
958 )
959 clean_title = re.sub(
960 r"\s*Published in.*", "", clean_title, flags=re.IGNORECASE
961 )
962 clean_title = re.sub(
963 r"\s*Volume.*", "", clean_title, flags=re.IGNORECASE
964 )
965 clean_title = re.sub(
966 r"\s*Pages.*", "", clean_title, flags=re.IGNORECASE
967 )
968 clean_title = clean_title.strip()
970 # TY - Type of reference (ELEC for electronic source/website)
971 lines.append("TY - ELEC")
973 # ID - Reference ID
974 lines.append(f"ID - ref{ref_id}")
976 # TI - Title
977 lines.append(f"TI - {clean_title if clean_title else title}")
979 # AU - Authors
980 for author in authors:
981 lines.append(f"AU - {author}")
983 # DO - DOI
984 if doi:
985 lines.append(f"DO - {doi}")
987 # PY - Publication year (if found in title)
988 if year:
989 lines.append(f"PY - {year}")
991 # UR - URL
992 if url:
993 lines.append(f"UR - {url}")
995 # Try to extract domain as publisher
996 try:
997 from urllib.parse import urlparse
999 parsed = urlparse(url)
1000 domain = parsed.netloc
1001 if domain.startswith("www."):
1002 domain = domain[4:]
1003 # Extract readable publisher name from domain
1004 if domain == "github.com" or domain.endswith(".github.com"):
1005 lines.append("PB - GitHub")
1006 elif domain == "arxiv.org" or domain.endswith(".arxiv.org"):
1007 lines.append("PB - arXiv")
1008 elif domain == "reddit.com" or domain.endswith(".reddit.com"):
1009 lines.append("PB - Reddit")
1010 elif (
1011 domain == "youtube.com"
1012 or domain == "m.youtube.com"
1013 or domain.endswith(".youtube.com")
1014 ):
1015 lines.append("PB - YouTube")
1016 elif domain == "medium.com" or domain.endswith(".medium.com"):
1017 lines.append("PB - Medium")
1018 elif domain == "pypi.org" or domain.endswith(".pypi.org"):
1019 lines.append("PB - Python Package Index (PyPI)")
1020 else:
1021 # Use domain as publisher
1022 lines.append(f"PB - {domain}")
1023 except (ValueError, AttributeError):
1024 pass
1026 # Y1 - Year accessed (current year)
1027 from datetime import datetime, UTC
1029 current_year = datetime.now(UTC).year
1030 lines.append(f"Y1 - {current_year}")
1032 # DA - Date accessed
1033 current_date = datetime.now(UTC).strftime("%Y/%m/%d")
1034 lines.append(f"DA - {current_date}")
1036 # LA - Language
1037 lines.append("LA - en")
1039 # ER - End of reference
1040 lines.append("ER - ")
1042 return "\n".join(lines)
class LaTeXExporter:
    """Export markdown documents to LaTeX format."""

    # Full escape table for text placed verbatim into LaTeX (bibliography
    # titles). Applied in one pass via str.translate so that characters
    # introduced by one replacement (the "{}" of \textbackslash{}) are never
    # escaped again by another.
    _LATEX_ESCAPES = str.maketrans(
        {
            "\\": r"\textbackslash{}",
            "&": r"\&",
            "%": r"\%",
            "$": r"\$",
            "#": r"\#",
            "_": r"\_",
            "{": r"\{",
            "}": r"\}",
            "~": r"\textasciitilde{}",
            "^": r"\textasciicircum{}",
        }
    )

    # Reduced table for the document body: '*', '#', '[' and ']' are left
    # alone because the heading/emphasis/citation patterns consume them later.
    _BODY_ESCAPES = str.maketrans({"&": r"\&", "%": r"\%", "_": r"\_"})

    # One source entry: "[n] Title" optionally followed by a "URL: ..." line.
    # Accepts the same bracket variants as ``citation_pattern``.
    _BIB_ENTRY_PATTERN = re.compile(
        r"^[\[【](\d+)[\]】]\s*(.+?)(?:\n\s*URL:\s*(.+?))?$", re.MULTILINE
    )

    def __init__(self):
        # Also match Unicode lenticular brackets 【】 (U+3010 and U+3011) that LLMs sometimes generate
        self.citation_pattern = re.compile(r"[\[【](\d+)[\]】]")
        self.heading_patterns = [
            (re.compile(r"^# (.+)$", re.MULTILINE), r"\\section{\1}"),
            (re.compile(r"^## (.+)$", re.MULTILINE), r"\\subsection{\1}"),
            (re.compile(r"^### (.+)$", re.MULTILINE), r"\\subsubsection{\1}"),
        ]
        self.emphasis_patterns = [
            (re.compile(r"\*\*(.+?)\*\*"), r"\\textbf{\1}"),
            (re.compile(r"\*(.+?)\*"), r"\\textit{\1}"),
            (re.compile(r"`(.+?)`"), r"\\texttt{\1}"),
        ]

    def export_to_latex(self, content: str) -> str:
        """
        Convert markdown document to LaTeX format.

        The body is escaped outside of math mode, then headings, emphasis,
        citations and bullet lists are converted, and finally a bibliography
        built from the sources section of *content* is appended.

        Args:
            content: Markdown content

        Returns:
            LaTeX formatted content
        """
        # Splitting on "$" puts text outside inline math at even indices.
        parts = content.split("$")
        last_index = len(parts) - 1
        for i, segment in enumerate(parts):
            if i % 2:
                continue  # odd index: inside $...$, leave untouched
            # An even segment surrounded by two empty segments sits between
            # "$$" delimiters, i.e. inside display math.
            inside_display_math = (
                0 < i < last_index
                and parts[i - 1] == ""
                and parts[i + 1] == ""
            )
            if not inside_display_math:
                # Heading lines are escaped too: '#' is not in the table, so
                # the heading markers survive while '&', '%' and '_' inside
                # \section{...} become valid LaTeX.
                parts[i] = segment.translate(self._BODY_ESCAPES)
        body_content = "$".join(parts)

        # Convert headings
        for pattern, replacement in self.heading_patterns:
            body_content = pattern.sub(replacement, body_content)

        # Convert emphasis
        for pattern, replacement in self.emphasis_patterns:
            body_content = pattern.sub(replacement, body_content)

        # Convert citations to LaTeX \cite{} format
        body_content = self.citation_pattern.sub(r"\\cite{\1}", body_content)

        # Convert lists
        body_content = self._convert_lists(body_content)

        return "".join(
            (
                self._create_latex_header(),
                body_content,
                # The bibliography is parsed from the untouched markdown.
                self._create_bibliography(content),
                self._create_latex_footer(),
            )
        )

    def _create_latex_header(self) -> str:
        """Create LaTeX document header (preamble up to ``\\maketitle``)."""
        return r"""\documentclass[12pt]{article}
\usepackage[utf8]{inputenc}
\usepackage{hyperref}
\usepackage{cite}
\usepackage{url}

\title{Research Report}
\date{\today}

\begin{document}
\maketitle

"""

    def _create_latex_footer(self) -> str:
        """Create LaTeX document footer."""
        return "\n\\end{document}\n"

    def _escape_latex(self, text: str) -> str:
        """Escape special LaTeX characters in text.

        Every special character is replaced exactly once; replacement text
        is never re-escaped, so a backslash becomes ``\\textbackslash{}``
        with its braces intact.
        """
        return text.translate(self._LATEX_ESCAPES)

    def _convert_lists(self, content: str) -> str:
        """Convert markdown ``- `` bullet lines to LaTeX ``itemize`` blocks.

        Consecutive ``\\item`` lines are wrapped in one environment. Blank
        lines do not close an open list; the next non-blank, non-item line
        (or the end of the text) does.
        """
        # Simple conversion for bullet points
        content = re.sub(r"^- (.+)$", r"\\item \1", content, flags=re.MULTILINE)

        result = []
        in_list = False
        for line in content.split("\n"):
            stripped = line.strip()
            if stripped.startswith("\\item"):
                if not in_list:
                    result.append("\\begin{itemize}")
                    in_list = True
            elif in_list and stripped:
                result.append("\\end{itemize}")
                in_list = False
            result.append(line)

        if in_list:
            result.append("\\end{itemize}")

        return "\n".join(result)

    def _create_bibliography(self, content: str) -> str:
        """Extract sources and create LaTeX bibliography.

        Returns an empty string when *content* has no sources section.
        Otherwise returns a ``thebibliography`` environment with one
        ``\\bibitem`` per numbered entry found from the section start
        onwards; titles are LaTeX-escaped and URLs are wrapped in ``\\url``.
        """
        sources_start = find_sources_section(content)
        if sources_start == -1:
            return ""

        pieces = ["\n\\begin{thebibliography}{99}\n"]
        for match in self._BIB_ENTRY_PATTERN.finditer(content[sources_start:]):
            citation_num = match.group(1)
            escaped_title = self._escape_latex(match.group(2).strip())
            url = match.group(3).strip() if match.group(3) else ""

            if url:
                pieces.append(
                    f"\\bibitem{{{citation_num}}} {escaped_title}. \\url{{{url}}}\n"
                )
            else:
                pieces.append(f"\\bibitem{{{citation_num}}} {escaped_title}.\n")

        pieces.append("\\end{thebibliography}\n")
        return "".join(pieces)