Coverage for src/local_deep_research/utilities/citation_normalizer.py: 87%

169 statements  

« prev     ^ index     » next       coverage.py v7.14.1, created at 2026-06-03 23:15 +0000

1""" 

2Normalize search engine result dicts into structured citation metadata. 

3 

4Converts engine-specific field names and formats into CSL-JSON vocabulary. 

5Each engine has its own dict shape; this module handles the differences. 

6""" 

7 

8import re 

9from datetime import date 

10from typing import Any, Optional 

11 

12 

# Names exported by a star-import of this module.
__all__ = [
    "normalize_citation",
    "normalize_issn",
    "detect_engine",
]

18 

19 

# Everything that cannot be part of an ISSN (dashes, whitespace, labels...).
_ISSN_CHARS = re.compile(r"[^0-9Xx]")
# Canonical shape: seven digits followed by a digit or an "X" check character.
_ISSN_CANONICAL = re.compile(r"[0-9]{7}[0-9X]")


def normalize_issn(s: Optional[str]) -> Optional[str]:
    """Canonicalize an ISSN to the 8-character no-dash form.

    Strips dashes, whitespace, and any non-digit/non-X characters; uppercases
    a trailing "x" check digit. Returns the canonical 8-char form, or None if
    the input is missing, is not a string, or cannot be coerced into seven
    digits followed by a digit-or-X check character.

    Examples:
        normalize_issn("1522-9645") == "15229645"
        normalize_issn("15229645") == "15229645"
        normalize_issn("1234-567x") == "1234567X"
        normalize_issn("X234-5678") is None   # "X" is only valid in last place
        normalize_issn("bad") is None
        normalize_issn(None) is None

    This is a structural canonicalization — it does not verify the ISSN
    checksum. The goal is format-independent equality for lookup.
    """
    # Non-string truthy input (e.g. an int) cannot be cleaned with a regex;
    # treat it like a missing value instead of raising TypeError.
    if not s or not isinstance(s, str):
        return None
    cleaned = _ISSN_CHARS.sub("", s).upper()
    # A length check alone would accept junk such as "XXXXXXXX"; require the
    # real ISSN shape so malformed values never become lookup keys.
    if _ISSN_CANONICAL.fullmatch(cleaned) is None:
        return None
    return cleaned

46 

47 

# Academic source engines that produce citation-worthy metadata
ACADEMIC_ENGINES = {
    "arxiv",
    "openalex",
    "semantic_scholar",
    "pubmed",
    "nasa_ads",
}

# URL patterns to detect source engine from URLs
_ENGINE_PATTERNS = [
    (re.compile(r"arxiv\.org"), "arxiv"),
    (re.compile(r"openalex\.org"), "openalex"),
    (re.compile(r"semanticscholar\.org"), "semantic_scholar"),
    (re.compile(r"ncbi\.nlm\.nih\.gov|pubmed"), "pubmed"),
    (re.compile(r"adsabs\.harvard\.edu|ui\.adsabs"), "nasa_ads"),
    (re.compile(r"doi\.org"), "doi"),
]


def detect_engine(source: dict) -> Optional[str]:
    """Detect which search engine produced this result.

    Checks the explicit ``source_engine`` / ``source`` field first and
    returns it (lower-cased, stripped) when it names one of
    ``ACADEMIC_ENGINES``. Otherwise the ``link`` / ``url`` value is matched
    against ``_ENGINE_PATTERNS`` in order; note that a doi.org URL yields
    ``"doi"``, which is not itself a member of ``ACADEMIC_ENGINES``.

    Returns None when nothing matches (web, news, etc.) or when the
    relevant fields are missing or not strings.
    """
    # Explicit engine field. Only strings can be normalized; any other
    # truthy value (int, dict, ...) is ignored rather than crashing .lower().
    engine = source.get("source_engine") or source.get("source")
    if isinstance(engine, str):
        engine_lower = engine.lower().strip()
        if engine_lower in ACADEMIC_ENGINES:
            return engine_lower

    # Detect from URL. A key that is present with a None / non-string value
    # must not reach pattern.search(), which would raise TypeError.
    url = source.get("link") or source.get("url")
    if not isinstance(url, str):
        return None
    for pattern, engine_name in _ENGINE_PATTERNS:
        if pattern.search(url):
            return engine_name

    return None

88 

89 

90def _parse_authors_list(authors: Any) -> Optional[list[dict]]: 

91 """Convert various author formats to CSL-JSON name objects. 

92 

93 Handles: 

94 - List of strings: ["John Smith", "Jane Doe"] 

95 - Comma-separated string: "John Smith, Jane Doe" 

96 - List of dicts with "name": [{"name": "John Smith"}] 

97 - Already CSL format: [{"family": "Smith", "given": "John"}] 

98 """ 

99 if not authors: 

100 return None 

101 

102 if isinstance(authors, str): 

103 authors = [a.strip() for a in authors.split(",") if a.strip()] 

104 

105 if not isinstance(authors, list): 105 ↛ 106line 105 didn't jump to line 106 because the condition on line 105 was never true

106 return None 

107 

108 result = [] 

109 for author in authors: 

110 if isinstance(author, dict): 

111 if "family" in author: 

112 # Whitelist only CSL name fields to ensure the dict is 

113 # JSON-serializable. Engines like OpenAlex and 

114 # Semantic Scholar attach nested affiliation objects, 

115 # ORCIDs, etc., which may contain non-primitive types 

116 # that would crash json.dumps() when stored in the 

117 # paper_metadata JSON column. 

118 safe = {"family": author["family"]} 

119 if "given" in author: 119 ↛ 121line 119 didn't jump to line 121 because the condition on line 119 was always true

120 safe["given"] = author["given"] 

121 if "suffix" in author: 121 ↛ 122line 121 didn't jump to line 122 because the condition on line 121 was never true

122 safe["suffix"] = author["suffix"] 

123 result.append(safe) 

124 elif "name" in author: 124 ↛ 126line 124 didn't jump to line 126 because the condition on line 124 was always true

125 result.append(_parse_name(author["name"])) 

126 elif "display_name" in author: 

127 result.append(_parse_name(author["display_name"])) 

128 elif isinstance(author, str): 128 ↛ 109line 128 didn't jump to line 109 because the condition on line 128 was always true

129 result.append(_parse_name(author)) 

130 

131 return result if result else None 

132 

133 

134def _parse_name(name: str) -> dict: 

135 """Parse a name string into CSL {"family", "given"} format.""" 

136 name = name.strip() 

137 if not name: 137 ↛ 138line 137 didn't jump to line 138 because the condition on line 137 was never true

138 return {"literal": ""} 

139 

140 # Handle "Last, First" format 

141 if "," in name: 

142 parts = name.split(",", 1) 

143 return {"family": parts[0].strip(), "given": parts[1].strip()} 

144 

145 # Handle "First Last" format 

146 parts = name.rsplit(" ", 1) 

147 if len(parts) == 2: 

148 return {"given": parts[0].strip(), "family": parts[1].strip()} 

149 

150 return {"literal": name} 

151 

152 

153def _parse_date(source: dict) -> tuple[Optional[date], Optional[int]]: 

154 """Extract publication date and year from various formats. 

155 

156 Returns (date_obj, year_int). Either may be None. 

157 """ 

158 year = None 

159 

160 # Try explicit year field 

161 raw_year = source.get("year") or source.get("publication_year") 

162 if raw_year: 

163 try: 

164 year = int(raw_year) 

165 except (ValueError, TypeError): 

166 pass 

167 

168 # Try date string 

169 date_str = ( 

170 source.get("publication_date") 

171 or source.get("published") 

172 or source.get("date") 

173 or source.get("pubdate") 

174 ) 

175 if date_str and isinstance(date_str, str): 

176 # Try ISO format: YYYY-MM-DD 

177 match = re.match(r"(\d{4})-(\d{1,2})-(\d{1,2})", date_str) 

178 if match: 178 ↛ 187line 178 didn't jump to line 187 because the condition on line 178 was always true

179 try: 

180 d = date(int(match[1]), int(match[2]), int(match[3])) 

181 if year is None: 

182 year = d.year 

183 return d, year 

184 except ValueError: 

185 pass 

186 # Try just year 

187 if year is None: 

188 match = re.match(r"(\d{4})", date_str) 

189 if match: 

190 year = int(match[1]) 

191 

192 return None, year 

193 

194 

195def _extract_arxiv_id(source: dict) -> Optional[str]: 

196 """Extract arXiv ID from URL or explicit field.""" 

197 arxiv_id = source.get("arxiv_id") 

198 if arxiv_id: 

199 return arxiv_id 

200 

201 url = source.get("link", "") or source.get("url", "") 

202 # Old-style (pre-Apr 2007): archive(.subject-class)?/YYMMNNN, e.g. 

203 # cond-mat/0501001, math.AG/0601001, hep-th/9802150. 

204 # New-style: YYMM.NNNN or YYMM.NNNNN (5-digit seq from 2015 onwards), 

205 # with optional vN version suffix, e.g. 2501.12345, 0704.0001v2. 

206 match = re.search( 

207 r"arxiv\.org/abs/((?:[a-z-]+(?:\.[A-Z]+)?/\d{7}|\d{4}\.\d{4,5})(?:v\d+)?)", 

208 url, 

209 ) 

210 if match: 210 ↛ 212line 210 didn't jump to line 212 because the condition on line 210 was always true

211 return match.group(1) 

212 return None 

213 

214 

215def _extract_doi(source: dict) -> Optional[str]: 

216 """Extract a DOI string from a result/source dict. 

217 

218 Tries (in order): 

219 1. ``source["doi"]`` — set by ArXiv, NASA ADS, OpenAlex 

220 2. ``source["external_ids"]["DOI"]`` / ``externalIds["DOI"]`` — Semantic 

221 Scholar style 

222 3. A DOI embedded in ``source["link"]`` (https://doi.org/...) 

223 

224 URL prefixes (``https://doi.org/``, ``http://doi.org/``, ``doi:``) are 

225 stripped so the returned value is a bare DOI like ``10.1038/...``. This 

226 is the single source of truth for DOI extraction across the codebase — 

227 `openalex_enrichment` and other modules import it from here. 

228 """ 

229 doi = source.get("doi") 

230 if isinstance(doi, list): 

231 doi = doi[0] if doi else None 

232 

233 # Semantic Scholar exposes DOIs through external_ids / externalIds 

234 if not doi: 

235 ext_ids = source.get("external_ids") or source.get("externalIds") or {} 

236 if isinstance(ext_ids, dict): 236 ↛ 242line 236 didn't jump to line 242 because the condition on line 236 was always true

237 doi = ext_ids.get("DOI") or ext_ids.get("doi") 

238 

239 # DOI embedded in a link URL — anchor on scheme to avoid CodeQL 

240 # py/incomplete-url-substring-sanitization (an attacker-controlled 

241 # URL path could contain "doi.org/" otherwise). 

242 if not doi: 

243 link = source.get("link") or "" 

244 if isinstance(link, str): 244 ↛ 255line 244 didn't jump to line 255 because the condition on line 244 was always true

245 for prefix in ( 

246 "https://doi.org/", 

247 "http://doi.org/", 

248 "https://dx.doi.org/", 

249 "http://dx.doi.org/", 

250 ): 

251 if link.startswith(prefix): 

252 doi = link[len(prefix) :] 

253 break 

254 

255 if not doi: 

256 return None 

257 

258 doi = str(doi) 

259 for prefix in ( 

260 "https://doi.org/", 

261 "http://doi.org/", 

262 "https://dx.doi.org/", 

263 "http://dx.doi.org/", 

264 "doi:", 

265 ): 

266 if doi.startswith(prefix): 

267 doi = doi[len(prefix) :] 

268 break 

269 return doi if doi else None 

270 

271 

def normalize_citation(source: dict) -> Optional[dict]:
    """Turn one search-result dict into citation metadata fields.

    The result maps Paper column names to values and is ready for Paper
    creation; keys whose value is None are omitted. Returns None when
    ``detect_engine`` does not recognise the source.
    """
    engine = detect_engine(source)
    if engine is None:
        return None

    pub_date, year = _parse_date(source)
    doi = _extract_doi(source)

    # source["metadata"] may be a truthy non-dict (e.g. a string); calling
    # .get() on it would raise AttributeError, so fall back to an empty dict.
    raw_meta = source.get("metadata")
    nested_meta = raw_meta if isinstance(raw_meta, dict) else {}

    # A structured CSL list wins over the display string: NASA ADS style
    # "Last, First" names lose their pairing when a comma-joined string is
    # split again.
    authors = _parse_authors_list(
        source.get("authors_csl")
        or nested_meta.get("authors_csl")
        or source.get("authors")
        or nested_meta.get("authors")
    )

    # Journal / conference name. CSL spellings (``container_title`` and
    # ``container-title``) are accepted too so callers already using CSL
    # vocabulary keep their journal.
    container = (
        source.get("journal_ref")
        or source.get("journal")
        or source.get("venue")
        or source.get("container_title")
        or source.get("container-title")
        or nested_meta.get("journal")
    )
    # Upstream engines (e.g. OpenAlex / NASA ADS) use placeholder values
    # when no venue is indexed. Drop them so they never become
    # container_title literals — an actual OpenAlex source is named
    # "unknown" (Q1, h_index=5) and would otherwise be hit by the
    # name-based lookup.
    if isinstance(container, str):
        if container.strip().lower() in ("unknown", ""):
            container = None

    # Item type (CSL vocabulary)
    item_type = _infer_item_type(engine, source)

    arxiv_id = None
    if engine == "arxiv":
        arxiv_id = _extract_arxiv_id(source)

    fields = dict(
        source_engine=engine,
        doi=doi,
        arxiv_id=arxiv_id,
        pmid=source.get("pmid"),
        pmcid=source.get("pmcid"),
        authors=authors,
        publication_date=pub_date,
        year=year,
        volume=source.get("volume"),
        issue=source.get("issue"),
        pages=source.get("pages"),
        container_title=container,
        publisher=source.get("publisher"),
        item_type=item_type,
    )

    # The CSL-JSON item is built while publication_date is still a native
    # date object.
    fields["csl_json"] = _build_csl_json(source, fields)

    # Only afterwards is the date turned into an ISO string, so it survives
    # JSON serialization when stored in paper_metadata.
    native_date = fields.get("publication_date")
    if native_date is not None:
        fields["publication_date"] = native_date.isoformat()

    # None values are left out to avoid unnecessary DB writes.
    normalized = {}
    for key, value in fields.items():
        if value is not None:
            normalized[key] = value
    return normalized

360 

361 

362def _infer_item_type(engine: str, source: dict) -> str: 

363 """Infer CSL item type from engine and source metadata.""" 

364 source_type = source.get("source_type", "") 

365 if source_type == "conference": 365 ↛ 366line 365 didn't jump to line 366 because the condition on line 365 was never true

366 return "paper-conference" 

367 

368 if engine == "arxiv": 

369 # ArXiv papers with journal_ref are published; without are preprints 

370 return "article-journal" if source.get("journal_ref") else "article" 

371 

372 return "article-journal" 

373 

374 

375def _build_csl_json(source: dict, fields: dict) -> dict: 

376 """Build a CSL-JSON item from normalized fields.""" 

377 csl: dict[str, Any] = { 

378 "type": fields.get("item_type", "article-journal"), 

379 "title": source.get("title", ""), 

380 } 

381 

382 if fields.get("authors"): 

383 csl["author"] = fields["authors"] 

384 

385 if fields.get("container_title"): 

386 csl["container-title"] = fields["container_title"] 

387 

388 if fields.get("doi"): 

389 csl["DOI"] = fields["doi"] 

390 

391 if fields.get("volume"): 

392 csl["volume"] = fields["volume"] 

393 if fields.get("issue"): 

394 csl["issue"] = fields["issue"] 

395 if fields.get("pages"): 

396 csl["page"] = fields["pages"] 

397 

398 if fields.get("publisher"): 398 ↛ 399line 398 didn't jump to line 399 because the condition on line 398 was never true

399 csl["publisher"] = fields["publisher"] 

400 

401 if fields.get("year"): 

402 date_parts = [[fields["year"]]] 

403 if fields.get("publication_date"): 

404 d = fields["publication_date"] 

405 date_parts = [[d.year, d.month, d.day]] 

406 csl["issued"] = {"date-parts": date_parts} 

407 

408 url = source.get("link") or source.get("url") 

409 if url: 

410 csl["URL"] = url 

411 

412 return csl