Coverage for src/local_deep_research/utilities/openalex_enrichment.py: 98%

73 statements  

« prev     ^ index     » next       coverage.py v7.14.1, created at 2026-06-03 23:15 +0000

1""" 

2Batch DOI → OpenAlex source_id enrichment. 

3 

4Resolves paper DOIs to OpenAlex source IDs in a single batch HTTP request 

5(up to 50 DOIs per call). This populates ``openalex_source_id`` and 

6``source_type`` on result dicts so the journal reputation filter can look 

7up journals by ID rather than fragile name matching. 

8 

9Runs as a pre-enrichment layer before content filters — the existing 

10tiered scoring system is unchanged. This just gives Tier 2 (OpenAlex 

11snapshot lookup) a reliable key to work with. 

12""" 

13 

14from typing import Any, Dict, List, Optional 

15 

16from loguru import logger 

17 

18from ..constants import OPENALEX_ENRICHMENT_API_TIMEOUT, USER_AGENT 

19from ..security.safe_requests import safe_get 

20from .citation_normalizer import _extract_doi 

21 

22 

# Base URL of the OpenAlex REST API; the batch lookup below appends
# ``/works`` to it.
_OPENALEX_API = "https://api.openalex.org"

# Upper bound on how many DOIs are OR-ed together in one ``filter=doi:...``
# request; larger DOI sets are split into chunks of this size.
_MAX_DOIS_PER_REQUEST = 50

25 

26 

27def _normalize_doi(doi: str) -> str: 

28 """Normalize a DOI to ``https://doi.org/<...>`` form for OpenAlex. 

29 

30 The ``startswith`` prefix checks below are fully anchored and 

31 CodeQL-safe. A previous code-scanning bot comment cited alert 7635 

32 (``py/incomplete-url-substring-sanitization``) against an earlier 

33 snapshot of this file; the current CodeQL scan does not raise it, 

34 and the anchored ``startswith`` pattern is the rule's recommended 

35 mitigation. Refactoring to a bare-first normalization was 

36 evaluated (PR #3081) but rejected as no-op churn — the 

37 ``https://doi.org/`` form OpenAlex actually returns round-trips 

38 unchanged through every branch here. Do not refactor without a 

39 new, reproducible functional issue. 

40 """ 

41 doi = doi.strip() 

42 if doi.startswith("https://doi.org/"): 

43 return doi 

44 if doi.startswith("http://doi.org/"): 

45 return doi.replace("http://", "https://") 

46 if doi.startswith("10."): 

47 return f"https://doi.org/{doi}" 

48 return doi 

49 

50 

def enrich_results_with_source_ids(
    results: List[Dict[str, Any]],
    email: Optional[str] = None,
) -> List[Dict[str, Any]]:
    """
    Batch-enrich results with OpenAlex source_id by DOI lookup.

    For results that have a DOI but no ``openalex_source_id``, queries
    the OpenAlex works endpoint (one request per chunk of at most
    ``_MAX_DOIS_PER_REQUEST`` DOIs) to resolve DOI → journal/conference
    source_id.

    The results list is modified in-place and also returned. The lookup
    is best-effort: a non-200 response or any exception for a chunk is
    logged and that chunk's results pass through unenriched.

    Args:
        results: List of result dicts from search engines.
        email: Optional email for OpenAlex polite pool. When given it is
            sent both as the ``mailto`` query parameter and inside the
            ``User-Agent`` header.

    Returns:
        The same results list, with ``openalex_source_id`` and
        ``source_type`` injected where resolved.
    """
    if not results:
        return results

    # Map each DOI that needs resolving to the positions of the results
    # carrying it (several results may share one DOI).
    doi_to_indices: Dict[str, List[int]] = {}
    for i, result in enumerate(results):
        # Skip results that already have a source_id
        if result.get("openalex_source_id"):
            continue

        doi = _extract_doi(result)
        if doi:
            # DOI names are case-insensitive; fold case so the key built
            # here matches the DOI in the response regardless of how
            # either side capitalizes it.
            key = _normalize_doi(doi).lower()
            doi_to_indices.setdefault(key, []).append(i)

    if not doi_to_indices:
        logger.debug("DOI enrichment: no DOIs to resolve")
        return results

    # safe_get auto-injects the project User-Agent. We only override
    # it here when an email is configured so OpenAlex's polite pool
    # can identify us. The mailto query param below also achieves
    # the polite-pool effect on its own.
    headers: Dict[str, str] = {"Accept": "application/json"}
    if email:
        headers["User-Agent"] = f"{USER_AGENT} ({email})"

    all_dois = list(doi_to_indices)
    # Distinct DOIs that received a source_id. Tracked per DOI (not per
    # result row) so the summary log's "N of M DOIs" is accurate even when
    # several results share a DOI.
    resolved_dois = set()

    # Batch DOIs into chunks of _MAX_DOIS_PER_REQUEST
    for chunk_start in range(0, len(all_dois), _MAX_DOIS_PER_REQUEST):
        chunk = all_dois[chunk_start : chunk_start + _MAX_DOIS_PER_REQUEST]

        params = {
            "filter": f"doi:{'|'.join(chunk)}",
            "per_page": str(len(chunk)),
            "select": "doi,primary_location",
        }
        if email:
            params["mailto"] = email

        try:
            response = safe_get(
                f"{_OPENALEX_API}/works",
                params=params,
                headers=headers,
                timeout=OPENALEX_ENRICHMENT_API_TIMEOUT,
            )
            if response.status_code != 200:
                logger.warning(
                    f"DOI enrichment: OpenAlex returned {response.status_code}"
                )
                continue

            # ``or []`` also covers an explicit JSON null for "results".
            works = response.json().get("results") or []

            for work in works:
                work_doi = work.get("doi", "")
                if not work_doi:
                    continue

                # Same case-folded normalization as the request keys.
                work_key = _normalize_doi(work_doi).lower()

                # Extract source info; either level may be null.
                location = work.get("primary_location") or {}
                source = location.get("source") or {}
                source_id_raw = source.get("id", "")
                source_type = source.get("type")

                if not source_id_raw:
                    continue

                # Extract short ID from URL (last path segment)
                source_id = source_id_raw.split("/")[-1]

                # Apply to all results with this DOI
                indices = doi_to_indices.get(work_key, [])
                for idx in indices:
                    results[idx]["openalex_source_id"] = source_id
                    if source_type:
                        results[idx]["source_type"] = source_type
                if indices:
                    resolved_dois.add(work_key)

        except Exception:
            logger.exception("DOI enrichment: OpenAlex batch lookup failed")
            # Graceful: results pass through unenriched
            continue

    if resolved_dois:
        logger.info(
            f"DOI enrichment: resolved {len(resolved_dois)} of "
            f"{len(doi_to_indices)} DOIs to OpenAlex source IDs"
        )
    else:
        logger.debug(
            f"DOI enrichment: no matches from {len(doi_to_indices)} DOIs"
        )

    return results