Coverage for src/local_deep_research/utilities/citation_normalizer.py

1"""

Normalize search engine result dicts into structured citation metadata.

Converts engine-specific field names and formats into CSL-JSON vocabulary.
Each engine has its own dict shape; this module handles the differences.

6"""

8import re

9from datetime import date

10from typing import Any, Optional

13__all__ = [

14 "normalize_citation",

15 "normalize_issn",

16 "detect_engine",

17]

# Anything that is not a digit or the "X" check character is stripped.
_ISSN_CHARS = re.compile(r"[^0-9Xx]")

# Canonical shape after stripping: seven digits plus a check character
# that is either a digit or "X". "X" is only legal in the last position.
_ISSN_CANONICAL = re.compile(r"[0-9]{7}[0-9X]")


def normalize_issn(s: Optional[str]) -> Optional[str]:
    """Canonicalize an ISSN to the 8-character no-dash form.

    Strips dashes, whitespace, and any non-digit/non-X characters, then
    uppercases a trailing "x" check digit.

    Args:
        s: Raw ISSN text in any common spelling, or None.

    Returns:
        The canonical 8-character form (seven digits followed by a digit
        or "X"), or None if the input is missing, is not a string, or does
        not reduce to that shape.

    Examples:
        normalize_issn("1522-9645") == "15229645"
        normalize_issn("15229645") == "15229645"
        normalize_issn("1234-567x") == "1234567X"
        normalize_issn("bad") is None
        normalize_issn(None) is None

    This is a structural canonicalization — it does not verify the ISSN
    checksum. The goal is format-independent equality for lookup.
    """
    if not s or not isinstance(s, str):
        return None
    cleaned = _ISSN_CHARS.sub("", s).upper()
    # A bare length check would accept an "X" picked up from surrounding
    # text (e.g. "Oxford 1234-567" -> "X1234567"), so validate the shape.
    if not _ISSN_CANONICAL.fullmatch(cleaned):
        return None
    return cleaned

# Academic source engines that produce citation-worthy metadata
ACADEMIC_ENGINES = {
    "arxiv",
    "openalex",
    "semantic_scholar",
    "pubmed",
    "nasa_ads",
}

# URL patterns to detect source engine from URLs. Host names are
# case-insensitive, so the patterns are too. Order matters: the first
# matching pattern wins, and the generic doi.org resolver is tried last.
_ENGINE_PATTERNS = [
    (re.compile(r"arxiv\.org", re.IGNORECASE), "arxiv"),
    (re.compile(r"openalex\.org", re.IGNORECASE), "openalex"),
    (re.compile(r"semanticscholar\.org", re.IGNORECASE), "semantic_scholar"),
    (re.compile(r"ncbi\.nlm\.nih\.gov|pubmed", re.IGNORECASE), "pubmed"),
    (re.compile(r"adsabs\.harvard\.edu|ui\.adsabs", re.IGNORECASE), "nasa_ads"),
    (re.compile(r"doi\.org", re.IGNORECASE), "doi"),
]


def detect_engine(source: dict) -> Optional[str]:
    """Detect which search engine produced this result.

    Checks the explicit ``source_engine`` / ``source`` field first, then
    falls back to matching the result URL against known host patterns.

    Args:
        source: Search result dict. Reads ``source_engine``, ``source``,
            ``link`` and ``url``.

    Returns:
        A member of ``ACADEMIC_ENGINES``, the string ``"doi"`` for a
        doi.org link, or None when nothing matches (web, news, etc.).
    """
    # Explicit engine field. Only strings can name an engine; some result
    # dicts carry a non-string value under "source", which must not crash.
    engine = source.get("source_engine") or source.get("source")
    if isinstance(engine, str):
        engine_lower = engine.lower().strip()
        if engine_lower in ACADEMIC_ENGINES:
            return engine_lower

    # Detect from URL: take the first non-empty string among link/url.
    # A key that is present but None (or otherwise not a string) is
    # skipped instead of being handed to the regex engine.
    url = ""
    for key in ("link", "url"):
        candidate = source.get(key)
        if isinstance(candidate, str) and candidate:
            url = candidate
            break

    for pattern, engine_name in _ENGINE_PATTERNS:
        if pattern.search(url):
            return engine_name

    return None

def _parse_authors_list(authors: Any) -> Optional[list[dict]]:
    """Convert various author formats to CSL-JSON name objects.

    Handles:
    - List (or tuple) of strings: ["John Smith", "Jane Doe"]
    - Comma-separated string: "John Smith, Jane Doe"
    - Semicolon-separated string: "Smith, John; Doe, Jane"
    - List of dicts with "name" / "display_name": [{"name": "John Smith"}]
    - Already CSL format: [{"family": "Smith", "given": "John"}]

    A string without semicolons is split on commas, so a lone
    "Smith, John" is read as two separate authors.

    Args:
        authors: Author data in any of the shapes above.

    Returns:
        A non-empty list of CSL name dicts, or None when the input is
        missing, has an unsupported type, or yields no usable names.
    """
    if not authors:
        return None

    if isinstance(authors, str):
        # A semicolon means the commas belong to "Last, First" pairs, so
        # split on ";" to keep each pair together.
        separator = ";" if ";" in authors else ","
        authors = [part.strip() for part in authors.split(separator)]
    elif isinstance(authors, tuple):
        authors = list(authors)

    if not isinstance(authors, list):
        return None

    result = []
    for author in authors:
        name = author
        if isinstance(author, dict):
            family = author.get("family")
            if family:
                # Whitelist only CSL name fields to ensure the dict is
                # JSON-serializable. Engines like OpenAlex and
                # Semantic Scholar attach nested affiliation objects,
                # ORCIDs, etc., which may contain non-primitive types
                # that would crash json.dumps() when stored in the
                # paper_metadata JSON column.
                safe = {"family": family}
                for key in ("given", "suffix"):
                    if author.get(key):
                        safe[key] = author[key]
                result.append(safe)
                continue
            # An empty or None "name" must not hide a usable display_name.
            name = author.get("name") or author.get("display_name")

        if isinstance(name, str):
            parsed = _parse_name(name)
            # _parse_name signals "nothing usable" with an empty literal;
            # such entries are dropped rather than stored as blank authors.
            if parsed.get("literal") != "":
                result.append(parsed)

    return result if result else None


def _parse_name(name: str) -> dict:
    """Parse a name string into CSL {"family", "given"} format.

    Args:
        name: A personal name, either "Last, First" or "First Last".

    Returns:
        ``{"family": ..., "given": ...}`` when both parts can be found,
        otherwise ``{"literal": ...}`` holding the whole name. A blank or
        non-string input gives ``{"literal": ""}``.
    """
    if not isinstance(name, str):
        return {"literal": ""}
    name = name.strip()
    if not name:
        return {"literal": ""}

    # Handle "Last, First" format
    if "," in name:
        family, _, given = name.partition(",")
        family, given = family.strip(), given.strip()
        if family and given:
            return {"family": family, "given": given}
        # A dangling comma ("Smith,") leaves only one usable part; keep it
        # as a literal instead of emitting an empty family/given field.
        return {"literal": family or given}

    # Handle "First Last" format. Splitting on any whitespace run also
    # copes with tabs and repeated spaces between the parts.
    parts = name.rsplit(None, 1)
    if len(parts) == 2:
        return {"given": parts[0].strip(), "family": parts[1].strip()}

    return {"literal": name}

151

152

def _parse_date(source: dict) -> tuple[Optional[date], Optional[int]]:
    """Extract publication date and year from various formats.

    The year comes from ``year`` / ``publication_year`` when one of them
    converts to an int. The date comes from the first usable value among
    ``publication_date``, ``published``, ``date`` and ``pubdate``. A usable
    value is a ``date``/``datetime`` object or a non-blank string. Values
    of any other type are skipped so they cannot mask a later key.

    Args:
        source: Search result dict.

    Returns:
        ``(date_obj, year_int)``. Either may be None. When no explicit
        year is given, the year is taken from the parsed date, or from a
        leading four-digit run in the date string.
    """
    year = None

    # Try explicit year field
    raw_year = source.get("year") or source.get("publication_year")
    if raw_year:
        try:
            year = int(raw_year)
        except (ValueError, TypeError):
            pass

    for key in ("publication_date", "published", "date", "pubdate"):
        value = source.get(key)

        if isinstance(value, date):
            # Also covers datetime (a date subclass); rebuilding drops the
            # time part so the result is always a plain date.
            parsed = date(value.year, value.month, value.day)
            return parsed, (parsed.year if year is None else year)

        if not isinstance(value, str) or not value.strip():
            continue
        date_str = value.strip()

        # Try ISO format: YYYY-MM-DD (a trailing time part is ignored)
        match = re.match(r"(\d{4})-(\d{1,2})-(\d{1,2})", date_str)
        if match:
            try:
                parsed = date(int(match[1]), int(match[2]), int(match[3]))
            except ValueError:
                # Impossible calendar date such as month 13; fall through
                # to the year-only attempt below.
                parsed = None
            if parsed is not None:
                return parsed, (parsed.year if year is None else year)

        # Try just year
        if year is None:
            match = re.match(r"(\d{4})", date_str)
            if match:
                year = int(match[1])
        # The first usable string decides; later keys are not consulted.
        break

    return None, year

193

194

# Old-style (pre-Apr 2007): archive(.subject-class)?/YYMMNNN, e.g.
# cond-mat/0501001, math.AG/0601001, hep-th/9802150.
# New-style: YYMM.NNNN or YYMM.NNNNN (5-digit seq from 2015 onwards),
# with optional vN version suffix, e.g. 2501.12345, 0704.0001v2.
# Only the host/path prefix is case-insensitive; the identifier itself
# keeps its case rules (lower-case archive, upper-case subject class).
_ARXIV_URL_ID = re.compile(
    r"(?i:arxiv\.org/(?:abs|pdf))/"
    r"((?:[a-z-]+(?:\.[A-Z]+)?/\d{7}|\d{4}\.\d{4,5})(?:v\d+)?)"
)


def _extract_arxiv_id(source: dict) -> Optional[str]:
    """Extract arXiv ID from URL or explicit field.

    Args:
        source: Search result dict. Reads ``arxiv_id``, ``link``, ``url``.

    Returns:
        The bare identifier (e.g. ``"2501.12345v2"`` or
        ``"hep-th/9802150"``), or None when neither the explicit field nor
        an arxiv.org ``/abs/`` or ``/pdf/`` URL provides one.
    """
    explicit = source.get("arxiv_id")
    if explicit:
        text = str(explicit).strip()
        # Drop an "arXiv:" citation prefix so the explicit field and the
        # URL path both yield the same bare form.
        if text.lower().startswith("arxiv:"):
            text = text[len("arxiv:"):].strip()
        if text:
            return text

    for key in ("link", "url"):
        url = source.get(key)
        # A key that is present but None (or not a string) is skipped
        # instead of being passed to the regex engine.
        if isinstance(url, str) and url:
            match = _ARXIV_URL_ID.search(url)
            return match.group(1) if match else None
    return None

213

214

# Resolver URL prefixes that may wrap a DOI. Compared against a
# lower-cased copy of the value, so they must stay lower-case.
_DOI_URL_PREFIXES = (
    "https://doi.org/",
    "http://doi.org/",
    "https://dx.doi.org/",
    "http://dx.doi.org/",
)


def _extract_doi(source: dict) -> Optional[str]:
    """Extract a DOI string from a result/source dict.

    Tries (in order):
      1. ``source["doi"]`` — set by ArXiv, NASA ADS, OpenAlex
      2. ``source["external_ids"]["DOI"]`` / ``externalIds["DOI"]`` — Semantic
         Scholar style
      3. A DOI embedded in ``source["link"]`` or ``source["url"]``
         (https://doi.org/...)

    URL prefixes (``https://doi.org/``, ``http://doi.org/``, ``doi:``) are
    stripped case-insensitively, together with surrounding whitespace, so
    the returned value is a bare DOI like ``10.1038/...``. This is the
    single source of truth for DOI extraction across the codebase —
    `openalex_enrichment` and other modules import it from here.

    Args:
        source: Search result dict.

    Returns:
        The bare DOI string, or None when no DOI can be found.
    """
    doi = source.get("doi")
    if isinstance(doi, list):
        doi = doi[0] if doi else None

    # Semantic Scholar exposes DOIs through external_ids / externalIds
    if not doi:
        ext_ids = source.get("external_ids") or source.get("externalIds") or {}
        if isinstance(ext_ids, dict):
            doi = ext_ids.get("DOI") or ext_ids.get("doi")

    # DOI embedded in a link URL — anchor on scheme to avoid CodeQL
    # py/incomplete-url-substring-sanitization (an attacker-controlled
    # URL path could contain "doi.org/" otherwise).
    if not doi:
        for key in ("link", "url"):
            candidate = source.get(key)
            if not isinstance(candidate, str):
                continue
            candidate = candidate.strip()
            if candidate.lower().startswith(_DOI_URL_PREFIXES):
                # Keep the full URL here; the prefix is removed by the
                # shared stripping step below.
                doi = candidate
                break

    if not doi:
        return None

    doi = str(doi).strip()
    lowered = doi.lower()
    for prefix in _DOI_URL_PREFIXES + ("doi:",):
        if lowered.startswith(prefix):
            doi = doi[len(prefix):].strip()
            break
    return doi if doi else None

270

271

def normalize_citation(source: dict) -> Optional[dict]:
    """Normalize a search result dict into citation metadata fields.

    Args:
        source: Raw search result dict from one of the search engines.

    Returns:
        A dict of field values ready for Paper creation, or None if the
        source is not an academic result (``detect_engine`` found no
        engine). The returned dict has keys matching Paper column names.
        Keys whose value would be None are omitted, and
        ``publication_date`` is an ISO ``YYYY-MM-DD`` string.
    """
    engine = detect_engine(source)
    if engine is None:
        return None

    pub_date, year = _parse_date(source)
    doi = _extract_doi(source)

    # Guard against source["metadata"] being a non-dict truthy value
    # (e.g., a string) — .get({}).get() crashes with AttributeError
    # in that case because the default only applies when the key is
    # absent/None.
    nested_meta = source.get("metadata")
    if not isinstance(nested_meta, dict):
        nested_meta = {}

    # Prefer a structured CSL list (e.g. NASA ADS publishes "Last, First"
    # names which lose their pairing if re-split from a comma-joined
    # display string) over the display-string fallback.
    authors_input = (
        source.get("authors_csl")
        or nested_meta.get("authors_csl")
        or source.get("authors")
        or nested_meta.get("authors")
    )
    authors = _parse_authors_list(authors_input)

    # Container title (journal/conference name).
    # Checks CSL-style ``container_title`` and ``container-title`` too
    # so callers that already use CSL vocabulary don't have their
    # journal silently dropped.
    #
    # Placeholder sentinels from upstream engines (e.g. OpenAlex /
    # NASA ADS set these when no venue is indexed) are skipped so they
    # don't become container_title literals — there's an actual
    # OpenAlex source named "unknown" (Q1, h_index=5) that would
    # otherwise get hit by the name-based lookup. Skipping happens per
    # candidate, so a placeholder under one key no longer hides a real
    # venue under a later key.
    container = None
    for candidate in (
        source.get("journal_ref"),
        source.get("journal"),
        source.get("venue"),
        source.get("container_title"),
        source.get("container-title"),
        nested_meta.get("journal"),
    ):
        if not candidate:
            continue
        if isinstance(candidate, str) and candidate.strip().lower() in (
            "unknown",
            "",
        ):
            continue
        container = candidate
        break

    # Item type (CSL vocabulary)
    item_type = _infer_item_type(engine, source)

    fields = {
        "source_engine": engine,
        "doi": doi,
        "arxiv_id": _extract_arxiv_id(source) if engine == "arxiv" else None,
        "pmid": source.get("pmid"),
        "pmcid": source.get("pmcid"),
        "authors": authors,
        "publication_date": pub_date,
        "year": year,
        "volume": source.get("volume"),
        "issue": source.get("issue"),
        "pages": source.get("pages"),
        "container_title": container,
        "publisher": source.get("publisher"),
        "item_type": item_type,
    }

    # Build CSL-JSON item (uses the date object while still native)
    fields["csl_json"] = _build_csl_json(source, fields)

    # Convert publication_date to ISO string so it survives JSON
    # serialization when stored in paper_metadata. _build_csl_json has
    # already consumed the native date object above.
    if fields.get("publication_date") is not None:
        fields["publication_date"] = fields["publication_date"].isoformat()

    # Strip None values to avoid unnecessary DB writes
    return {k: v for k, v in fields.items() if v is not None}

360

361

def _infer_item_type(engine: str, source: dict) -> str:
    """Pick the CSL item type for a result.

    Args:
        engine: Engine name as returned by the engine detection step.
        source: Search result dict. Reads ``source_type`` and
            ``journal_ref``.

    Returns:
        ``"paper-conference"`` when ``source_type`` is ``"conference"``.
        Otherwise ``"article"`` for an arXiv result with no
        ``journal_ref``, and ``"article-journal"`` in every other case.
    """
    if source.get("source_type", "") == "conference":
        return "paper-conference"

    # An arXiv entry that carries a journal_ref has been published; one
    # without it is still a preprint.
    is_preprint = engine == "arxiv" and not source.get("journal_ref")
    return "article" if is_preprint else "article-journal"

373

374

def _build_csl_json(source: dict, fields: dict) -> dict:
    """Build a CSL-JSON item from normalized fields.

    Args:
        source: Original search result dict. Reads ``title``, ``link``
            and ``url``.
        fields: Normalized citation fields. ``publication_date`` may be a
            ``date`` object or an ISO ``YYYY-MM-DD`` string.

    Returns:
        A dict using CSL-JSON keys. ``type`` and ``title`` are always
        present; every other key is included only when its value is
        truthy.
    """
    csl: dict[str, Any] = {
        # "or" rather than a .get() default: a key that is present but
        # None must still fall back to the default value.
        "type": fields.get("item_type") or "article-journal",
        "title": source.get("title") or "",
    }

    # Normalized field name -> CSL-JSON key, in output order.
    for field_key, csl_key in (
        ("authors", "author"),
        ("container_title", "container-title"),
        ("doi", "DOI"),
        ("volume", "volume"),
        ("issue", "issue"),
        ("pages", "page"),
        ("publisher", "publisher"),
    ):
        value = fields.get(field_key)
        if value:
            csl[csl_key] = value

    pub_date = fields.get("publication_date")
    if isinstance(pub_date, str):
        # Accept the ISO string form as well as a native date; only the
        # leading YYYY-MM-DD part is parsed.
        try:
            pub_date = date.fromisoformat(pub_date[:10])
        except ValueError:
            pub_date = None

    if isinstance(pub_date, date):
        csl["issued"] = {
            "date-parts": [[pub_date.year, pub_date.month, pub_date.day]]
        }
    elif fields.get("year"):
        csl["issued"] = {"date-parts": [[fields["year"]]]}

    url = source.get("link") or source.get("url")
    if url:
        csl["URL"] = url

    return csl

Coverage for src/local_deep_research/utilities/citation_normalizer.py: 87%

169 statements