Coverage for src/local_deep_research/utilities/citation_normalizer.py: 87%
169 statements
« prev ^ index » next coverage.py v7.14.1, created at 2026-06-03 23:15 +0000
« prev ^ index » next coverage.py v7.14.1, created at 2026-06-03 23:15 +0000
1"""
2Normalize search engine result dicts into structured citation metadata.
4Converts engine-specific field names and formats into CSL-JSON vocabulary.
5Each engine has its own dict shape; this module handles the differences.
6"""
8import re
9from datetime import date
10from typing import Any, Optional
# Public API picked up by a star-import of this module.
__all__ = ["normalize_citation", "normalize_issn", "detect_engine"]
# Anything that is not an ASCII digit or an X/x is treated as noise
# (dashes, whitespace, "ISSN" labels, ...).
_ISSN_CHARS = re.compile(r"[^0-9Xx]")

# Canonical shape after cleaning: seven digits followed by a digit or the
# "X" check character. "X" is only meaningful in the final position.
_ISSN_CANONICAL = re.compile(r"[0-9]{7}[0-9X]")


def normalize_issn(s: Optional[str]) -> Optional[str]:
    """Canonicalize an ISSN to the 8-character no-dash form.

    Strips dashes, whitespace, and any non-digit/non-X characters, and
    uppercases an "x" check digit. The cleaned value must then consist of
    seven digits followed by a digit or "X".

    Args:
        s: Raw ISSN text in any common formatting, or None.

    Returns:
        The canonical 8-character ISSN, or None if the input is missing,
        is not a string, or cannot be coerced into the canonical shape.

    Examples:
        normalize_issn("1522-9645") == "15229645"
        normalize_issn("15229645") == "15229645"
        normalize_issn("1234-567x") == "1234567X"
        normalize_issn("12X4-5678") is None
        normalize_issn("bad") is None
        normalize_issn(None) is None

    This is a structural canonicalization — it does not verify the ISSN
    checksum. The goal is format-independent equality for lookup.
    """
    # Non-string values (ints, lists, ...) cannot be cleaned by the regex
    # and would otherwise raise TypeError; treat them as "no ISSN".
    if not s or not isinstance(s, str):
        return None
    cleaned = _ISSN_CHARS.sub("", s).upper()
    if not _ISSN_CANONICAL.fullmatch(cleaned):
        return None
    return cleaned
# Academic source engines that produce citation-worthy metadata
ACADEMIC_ENGINES = {
    "arxiv",
    "openalex",
    "semantic_scholar",
    "pubmed",
    "nasa_ads",
}

# URL patterns to detect source engine from URLs.
# Order matters: the first pattern that matches wins.
_ENGINE_PATTERNS = [
    (re.compile(r"arxiv\.org"), "arxiv"),
    (re.compile(r"openalex\.org"), "openalex"),
    (re.compile(r"semanticscholar\.org"), "semantic_scholar"),
    (re.compile(r"ncbi\.nlm\.nih\.gov|pubmed"), "pubmed"),
    (re.compile(r"adsabs\.harvard\.edu|ui\.adsabs"), "nasa_ads"),
    (re.compile(r"doi\.org"), "doi"),
]


def detect_engine(source: dict) -> Optional[str]:
    """Detect which search engine produced this result.

    Checks the explicit ``source_engine`` / ``source`` field first, then
    falls back to matching the result's ``link`` / ``url`` against the
    known URL patterns.

    Args:
        source: Raw search-result dict.

    Returns:
        One of the ACADEMIC_ENGINES names, ``"doi"`` when only a doi.org
        URL identifies the result, or None for non-academic sources
        (web, news, etc.).
    """
    # Explicit engine field. Only strings can be normalized; anything else
    # (numbers, nested dicts) is ignored instead of crashing on .lower().
    engine = source.get("source_engine") or source.get("source")
    if isinstance(engine, str):
        engine_lower = engine.lower().strip()
        if engine_lower in ACADEMIC_ENGINES:
            return engine_lower

    # Detect from URL. A key that is present but None (or otherwise not a
    # non-empty string) must not reach pattern.search(), which only
    # accepts strings.
    url = None
    for key in ("link", "url"):
        candidate = source.get(key)
        if isinstance(candidate, str) and candidate:
            url = candidate
            break
    if url is None:
        return None

    for pattern, engine_name in _ENGINE_PATTERNS:
        if pattern.search(url):
            return engine_name

    return None
90def _parse_authors_list(authors: Any) -> Optional[list[dict]]:
91 """Convert various author formats to CSL-JSON name objects.
93 Handles:
94 - List of strings: ["John Smith", "Jane Doe"]
95 - Comma-separated string: "John Smith, Jane Doe"
96 - List of dicts with "name": [{"name": "John Smith"}]
97 - Already CSL format: [{"family": "Smith", "given": "John"}]
98 """
99 if not authors:
100 return None
102 if isinstance(authors, str):
103 authors = [a.strip() for a in authors.split(",") if a.strip()]
105 if not isinstance(authors, list): 105 ↛ 106line 105 didn't jump to line 106 because the condition on line 105 was never true
106 return None
108 result = []
109 for author in authors:
110 if isinstance(author, dict):
111 if "family" in author:
112 # Whitelist only CSL name fields to ensure the dict is
113 # JSON-serializable. Engines like OpenAlex and
114 # Semantic Scholar attach nested affiliation objects,
115 # ORCIDs, etc., which may contain non-primitive types
116 # that would crash json.dumps() when stored in the
117 # paper_metadata JSON column.
118 safe = {"family": author["family"]}
119 if "given" in author: 119 ↛ 121line 119 didn't jump to line 121 because the condition on line 119 was always true
120 safe["given"] = author["given"]
121 if "suffix" in author: 121 ↛ 122line 121 didn't jump to line 122 because the condition on line 121 was never true
122 safe["suffix"] = author["suffix"]
123 result.append(safe)
124 elif "name" in author: 124 ↛ 126line 124 didn't jump to line 126 because the condition on line 124 was always true
125 result.append(_parse_name(author["name"]))
126 elif "display_name" in author:
127 result.append(_parse_name(author["display_name"]))
128 elif isinstance(author, str): 128 ↛ 109line 128 didn't jump to line 109 because the condition on line 128 was always true
129 result.append(_parse_name(author))
131 return result if result else None
134def _parse_name(name: str) -> dict:
135 """Parse a name string into CSL {"family", "given"} format."""
136 name = name.strip()
137 if not name: 137 ↛ 138line 137 didn't jump to line 138 because the condition on line 137 was never true
138 return {"literal": ""}
140 # Handle "Last, First" format
141 if "," in name:
142 parts = name.split(",", 1)
143 return {"family": parts[0].strip(), "given": parts[1].strip()}
145 # Handle "First Last" format
146 parts = name.rsplit(" ", 1)
147 if len(parts) == 2:
148 return {"given": parts[0].strip(), "family": parts[1].strip()}
150 return {"literal": name}
153def _parse_date(source: dict) -> tuple[Optional[date], Optional[int]]:
154 """Extract publication date and year from various formats.
156 Returns (date_obj, year_int). Either may be None.
157 """
158 year = None
160 # Try explicit year field
161 raw_year = source.get("year") or source.get("publication_year")
162 if raw_year:
163 try:
164 year = int(raw_year)
165 except (ValueError, TypeError):
166 pass
168 # Try date string
169 date_str = (
170 source.get("publication_date")
171 or source.get("published")
172 or source.get("date")
173 or source.get("pubdate")
174 )
175 if date_str and isinstance(date_str, str):
176 # Try ISO format: YYYY-MM-DD
177 match = re.match(r"(\d{4})-(\d{1,2})-(\d{1,2})", date_str)
178 if match: 178 ↛ 187line 178 didn't jump to line 187 because the condition on line 178 was always true
179 try:
180 d = date(int(match[1]), int(match[2]), int(match[3]))
181 if year is None:
182 year = d.year
183 return d, year
184 except ValueError:
185 pass
186 # Try just year
187 if year is None:
188 match = re.match(r"(\d{4})", date_str)
189 if match:
190 year = int(match[1])
192 return None, year
195def _extract_arxiv_id(source: dict) -> Optional[str]:
196 """Extract arXiv ID from URL or explicit field."""
197 arxiv_id = source.get("arxiv_id")
198 if arxiv_id:
199 return arxiv_id
201 url = source.get("link", "") or source.get("url", "")
202 # Old-style (pre-Apr 2007): archive(.subject-class)?/YYMMNNN, e.g.
203 # cond-mat/0501001, math.AG/0601001, hep-th/9802150.
204 # New-style: YYMM.NNNN or YYMM.NNNNN (5-digit seq from 2015 onwards),
205 # with optional vN version suffix, e.g. 2501.12345, 0704.0001v2.
206 match = re.search(
207 r"arxiv\.org/abs/((?:[a-z-]+(?:\.[A-Z]+)?/\d{7}|\d{4}\.\d{4,5})(?:v\d+)?)",
208 url,
209 )
210 if match: 210 ↛ 212line 210 didn't jump to line 212 because the condition on line 210 was always true
211 return match.group(1)
212 return None
215def _extract_doi(source: dict) -> Optional[str]:
216 """Extract a DOI string from a result/source dict.
218 Tries (in order):
219 1. ``source["doi"]`` — set by ArXiv, NASA ADS, OpenAlex
220 2. ``source["external_ids"]["DOI"]`` / ``externalIds["DOI"]`` — Semantic
221 Scholar style
222 3. A DOI embedded in ``source["link"]`` (https://doi.org/...)
224 URL prefixes (``https://doi.org/``, ``http://doi.org/``, ``doi:``) are
225 stripped so the returned value is a bare DOI like ``10.1038/...``. This
226 is the single source of truth for DOI extraction across the codebase —
227 `openalex_enrichment` and other modules import it from here.
228 """
229 doi = source.get("doi")
230 if isinstance(doi, list):
231 doi = doi[0] if doi else None
233 # Semantic Scholar exposes DOIs through external_ids / externalIds
234 if not doi:
235 ext_ids = source.get("external_ids") or source.get("externalIds") or {}
236 if isinstance(ext_ids, dict): 236 ↛ 242line 236 didn't jump to line 242 because the condition on line 236 was always true
237 doi = ext_ids.get("DOI") or ext_ids.get("doi")
239 # DOI embedded in a link URL — anchor on scheme to avoid CodeQL
240 # py/incomplete-url-substring-sanitization (an attacker-controlled
241 # URL path could contain "doi.org/" otherwise).
242 if not doi:
243 link = source.get("link") or ""
244 if isinstance(link, str): 244 ↛ 255line 244 didn't jump to line 255 because the condition on line 244 was always true
245 for prefix in (
246 "https://doi.org/",
247 "http://doi.org/",
248 "https://dx.doi.org/",
249 "http://dx.doi.org/",
250 ):
251 if link.startswith(prefix):
252 doi = link[len(prefix) :]
253 break
255 if not doi:
256 return None
258 doi = str(doi)
259 for prefix in (
260 "https://doi.org/",
261 "http://doi.org/",
262 "https://dx.doi.org/",
263 "http://dx.doi.org/",
264 "doi:",
265 ):
266 if doi.startswith(prefix):
267 doi = doi[len(prefix) :]
268 break
269 return doi if doi else None
def normalize_citation(source: dict) -> Optional[dict]:
    """Turn one search-result dict into citation metadata fields.

    Args:
        source: Raw, engine-specific search-result dict.

    Returns:
        A dict of non-None field values (keys are meant to match Paper
        column names) including a ``csl_json`` item, or None when no
        engine can be detected for the source.
    """
    engine = detect_engine(source)
    if engine is None:
        return None

    pub_date, year = _parse_date(source)
    doi = _extract_doi(source)

    # source["metadata"] may hold a truthy non-dict (e.g. a string);
    # substitute an empty dict so the .get() lookups below stay safe.
    meta = source.get("metadata")
    if not isinstance(meta, dict):
        meta = {}

    # First truthy candidate wins. The structured CSL list is preferred
    # because "Last, First" names lose their pairing when re-split from a
    # comma-joined display string.
    authors_input = None
    for holder, key in (
        (source, "authors_csl"),
        (meta, "authors_csl"),
        (source, "authors"),
        (meta, "authors"),
    ):
        authors_input = holder.get(key)
        if authors_input:
            break
    authors = _parse_authors_list(authors_input)

    # Container title (journal/conference name). CSL-style spellings
    # ("container_title", "container-title") are honoured as well; the
    # nested metadata journal is the last resort.
    for key in (
        "journal_ref",
        "journal",
        "venue",
        "container_title",
        "container-title",
    ):
        container = source.get(key)
        if container:
            break
    else:
        container = meta.get("journal")

    # Drop placeholder sentinels that upstream engines emit when no venue
    # is indexed, so they never become container_title literals that a
    # name-based lookup could match against a real source.
    if isinstance(container, str):
        if container.strip().lower() in ("unknown", ""):
            container = None

    # Item type (CSL vocabulary)
    item_type = _infer_item_type(engine, source)

    arxiv_id = None
    if engine == "arxiv":
        arxiv_id = _extract_arxiv_id(source)

    fields = {
        "source_engine": engine,
        "doi": doi,
        "arxiv_id": arxiv_id,
        "pmid": source.get("pmid"),
        "pmcid": source.get("pmcid"),
        "authors": authors,
        "publication_date": pub_date,
        "year": year,
        "volume": source.get("volume"),
        "issue": source.get("issue"),
        "pages": source.get("pages"),
        "container_title": container,
        "publisher": source.get("publisher"),
        "item_type": item_type,
    }

    # The CSL item is built while publication_date is still a native
    # date object.
    fields["csl_json"] = _build_csl_json(source, fields)

    # Afterwards the date is stored as an ISO string so the result
    # survives JSON serialization.
    native_date = fields.get("publication_date")
    if native_date is not None:
        fields["publication_date"] = native_date.isoformat()

    # Strip None values to avoid unnecessary DB writes
    populated = {}
    for key, value in fields.items():
        if value is not None:
            populated[key] = value
    return populated
362def _infer_item_type(engine: str, source: dict) -> str:
363 """Infer CSL item type from engine and source metadata."""
364 source_type = source.get("source_type", "")
365 if source_type == "conference": 365 ↛ 366line 365 didn't jump to line 366 because the condition on line 365 was never true
366 return "paper-conference"
368 if engine == "arxiv":
369 # ArXiv papers with journal_ref are published; without are preprints
370 return "article-journal" if source.get("journal_ref") else "article"
372 return "article-journal"
375def _build_csl_json(source: dict, fields: dict) -> dict:
376 """Build a CSL-JSON item from normalized fields."""
377 csl: dict[str, Any] = {
378 "type": fields.get("item_type", "article-journal"),
379 "title": source.get("title", ""),
380 }
382 if fields.get("authors"):
383 csl["author"] = fields["authors"]
385 if fields.get("container_title"):
386 csl["container-title"] = fields["container_title"]
388 if fields.get("doi"):
389 csl["DOI"] = fields["doi"]
391 if fields.get("volume"):
392 csl["volume"] = fields["volume"]
393 if fields.get("issue"):
394 csl["issue"] = fields["issue"]
395 if fields.get("pages"):
396 csl["page"] = fields["pages"]
398 if fields.get("publisher"): 398 ↛ 399line 398 didn't jump to line 399 because the condition on line 398 was never true
399 csl["publisher"] = fields["publisher"]
401 if fields.get("year"):
402 date_parts = [[fields["year"]]]
403 if fields.get("publication_date"):
404 d = fields["publication_date"]
405 date_parts = [[d.year, d.month, d.day]]
406 csl["issued"] = {"date-parts": date_parts}
408 url = source.get("link") or source.get("url")
409 if url:
410 csl["URL"] = url
412 return csl