Coverage for src/local_deep_research/utilities/openalex_enrichment.py: 98%
73 statements
« prev ^ index » next coverage.py v7.14.1, created at 2026-06-03 23:15 +0000
« prev ^ index » next coverage.py v7.14.1, created at 2026-06-03 23:15 +0000
1"""
2Batch DOI → OpenAlex source_id enrichment.
4Resolves paper DOIs to OpenAlex source IDs in a single batch HTTP request
5(up to 50 DOIs per call). This populates ``openalex_source_id`` and
6``source_type`` on result dicts so the journal reputation filter can look
7up journals by ID rather than fragile name matching.
9Runs as a pre-enrichment layer before content filters — the existing
10tiered scoring system is unchanged. This just gives Tier 2 (OpenAlex
11snapshot lookup) a reliable key to work with.
12"""
14from typing import Any, Dict, List, Optional
16from loguru import logger
18from ..constants import OPENALEX_ENRICHMENT_API_TIMEOUT, USER_AGENT
19from ..security.safe_requests import safe_get
20from .citation_normalizer import _extract_doi
# Base URL of the OpenAlex REST API; endpoint paths such as ``/works`` are
# appended to it when a request is built.
_OPENALEX_API = "https://api.openalex.org"
# Upper bound on how many DOIs are OR-ed together into a single batch
# lookup request; longer DOI lists are split into chunks of this size.
_MAX_DOIS_PER_REQUEST = 50
27def _normalize_doi(doi: str) -> str:
28 """Normalize a DOI to ``https://doi.org/<...>`` form for OpenAlex.
30 The ``startswith`` prefix checks below are fully anchored and
31 CodeQL-safe. A previous code-scanning bot comment cited alert 7635
32 (``py/incomplete-url-substring-sanitization``) against an earlier
33 snapshot of this file; the current CodeQL scan does not raise it,
34 and the anchored ``startswith`` pattern is the rule's recommended
35 mitigation. Refactoring to a bare-first normalization was
36 evaluated (PR #3081) but rejected as no-op churn — the
37 ``https://doi.org/`` form OpenAlex actually returns round-trips
38 unchanged through every branch here. Do not refactor without a
39 new, reproducible functional issue.
40 """
41 doi = doi.strip()
42 if doi.startswith("https://doi.org/"):
43 return doi
44 if doi.startswith("http://doi.org/"):
45 return doi.replace("http://", "https://")
46 if doi.startswith("10."):
47 return f"https://doi.org/{doi}"
48 return doi
def enrich_results_with_source_ids(
    results: List[Dict[str, Any]],
    email: Optional[str] = None,
) -> List[Dict[str, Any]]:
    """
    Batch-enrich results with OpenAlex source_id by DOI lookup.

    For results that have a DOI but no ``openalex_source_id``, queries the
    OpenAlex works endpoint (one request per chunk of up to
    ``_MAX_DOIS_PER_REQUEST`` DOIs) to resolve DOI → journal/conference
    source_id.

    DOIs are matched case-insensitively: the lookup key is the lowercased
    normalized DOI, both for the DOIs collected from ``results`` and for
    the DOIs reported back by OpenAlex.

    The lookup is best-effort. A non-200 response is logged as a warning
    and any exception raised while fetching or parsing a chunk is logged;
    in both cases that chunk is skipped and its results pass through
    unenriched.

    The results list is modified in-place and also returned.

    Args:
        results: List of result dicts from search engines.
        email: Optional email for OpenAlex polite pool. When given it is
            sent as the ``mailto`` query parameter and appended to the
            ``User-Agent`` header.

    Returns:
        The same results list, with ``openalex_source_id`` and
        ``source_type`` injected where resolved.
    """
    if not results:
        return results

    # Map every DOI that still needs resolving to the positions of the
    # results that carry it, so one OpenAlex hit can update all duplicates.
    doi_to_indices: Dict[str, List[int]] = {}
    for i, result in enumerate(results):
        # Skip results that already have a source_id
        if result.get("openalex_source_id"):
            continue

        doi = _extract_doi(result)
        if doi:
            # DOIs are case-insensitive; lowercase the key so it compares
            # equal to the DOI of the matching work in the response.
            normalized = _normalize_doi(doi).lower()
            doi_to_indices.setdefault(normalized, []).append(i)

    if not doi_to_indices:
        logger.debug("DOI enrichment: no DOIs to resolve")
        return results

    # Batch DOIs into chunks of MAX_DOIS_PER_REQUEST
    all_dois = list(doi_to_indices)
    # Distinct DOIs that resolved to a source; keeps the summary log
    # ("N of M DOIs") accurate even when several results share a DOI.
    resolved_dois = set()

    for chunk_start in range(0, len(all_dois), _MAX_DOIS_PER_REQUEST):
        chunk = all_dois[chunk_start : chunk_start + _MAX_DOIS_PER_REQUEST]
        # "|" is the OR separator inside an OpenAlex filter value.
        doi_filter = "|".join(chunk)

        params = {
            "filter": f"doi:{doi_filter}",
            "per_page": str(len(chunk)),
            "select": "doi,primary_location",
        }
        if email:
            params["mailto"] = email

        # safe_get auto-injects the project User-Agent. We only override
        # it here when an email is configured so OpenAlex's polite pool
        # can identify us. The mailto query param above also achieves
        # the polite-pool effect on its own.
        headers: Dict[str, str] = {"Accept": "application/json"}
        if email:
            headers["User-Agent"] = f"{USER_AGENT} ({email})"

        try:
            response = safe_get(
                f"{_OPENALEX_API}/works",
                params=params,
                headers=headers,
                timeout=OPENALEX_ENRICHMENT_API_TIMEOUT,
            )
            if response.status_code != 200:
                logger.warning(
                    f"DOI enrichment: OpenAlex returned {response.status_code}"
                )
                continue

            data = response.json()
            # Treat a missing or null "results" field as an empty page.
            works = data.get("results") or []

            for work in works:
                work_doi = work.get("doi")
                if not work_doi:
                    continue

                # Normalize for matching (same lowercase key as above)
                work_doi_normalized = _normalize_doi(work_doi).lower()

                # Extract source info; either level may be null in the
                # response, hence the ``or {}`` fallbacks.
                location = work.get("primary_location") or {}
                source = location.get("source") or {}
                source_id_raw = source.get("id", "")
                source_type = source.get("type")

                if not source_id_raw:
                    continue

                # Extract short ID from URL (last path segment)
                source_id = source_id_raw.split("/")[-1]

                # Apply to all results with this DOI
                indices = doi_to_indices.get(work_doi_normalized, [])
                for idx in indices:
                    results[idx]["openalex_source_id"] = source_id
                    if source_type:
                        results[idx]["source_type"] = source_type
                if indices:
                    resolved_dois.add(work_doi_normalized)

        except Exception:
            logger.exception("DOI enrichment: OpenAlex batch lookup failed")
            # Graceful: results pass through unenriched
            continue

    enriched_count = len(resolved_dois)
    if enriched_count > 0:
        logger.info(
            f"DOI enrichment: resolved {enriched_count} of "
            f"{len(doi_to_indices)} DOIs to OpenAlex source IDs"
        )
    else:
        logger.debug(
            f"DOI enrichment: no matches from {len(doi_to_indices)} DOIs"
        )

    return results