Coverage for src/local_deep_research/journal_quality/data

"""OpenAlex Institutions data source.

Downloads the **bulk snapshot** of ~120K institutions from the public
OpenAlex S3 gateway (``openalex.s3.amazonaws.com``) and writes a
compact gzipped JSON snapshot used by the institution-scoring tier of
the journal filter. Each institution carries its ROR ID, country,
h-index, works count, and 2-year mean citedness.

Why bulk snapshot instead of the REST API
-----------------------------------------

The previous implementation cursor-paginated ``/api/institutions`` at
200/page with 100 ms polite sleeps — ~550 requests + ~55 s of sleep +
actual transfer. Wall-clock was 5–10 minutes per "Download Data"
click, which dominated the user-facing latency. The S3 dump is the
documented bulk path, CC0, no auth, no rate limits, and finishes
much faster.
"""

20from __future__ import annotations

22import gzip

23import json

24import time

25from pathlib import Path

27from loguru import logger

29from ._openalex_common import (

30 OPENALEX_S3_BASE,

31 iter_partitions,

32 validate_manifest_entries,

33)

34from ..scoring import normalize_name

35from .base import DataSource

# URL of the manifest that lists every part file of the institutions
# bulk snapshot.
_OPENALEX_INSTITUTIONS_MANIFEST = (
    f"{OPENALEX_S3_BASE}"
    "/data/jsonl/institutions/manifest.json"
)

# Lower bound on an acceptable record count. OpenAlex has ~120K
# institutions, so a fetch that yields far fewer most likely hit a
# schema change, empty partitions, or a broken manifest — in that case
# the existing good data must not be overwritten.
_MIN_INSTITUTIONS = 50_000

class InstitutionSource(DataSource):
    """Data source that builds a compact institution index from OpenAlex.

    ``fetch`` reads the institutions bulk-snapshot manifest, streams every
    part file, and writes one gzipped JSON document with three maps:

    * ``"i"``  – institution ID → compact record
    * ``"r"``  – ROR ID → institution ID
    * ``"nm"`` – normalized name → institution ID
    """

    key = "institutions"  # gitleaks:allow
    name = "OpenAlex Institutions"
    url = "https://openalex.org"
    dataset_url = (
        "https://docs.openalex.org/download-all-data/openalex-snapshot"
    )
    license = "CC0 1.0"
    license_url = "https://creativecommons.org/publicdomain/zero/1.0/"
    description = (
        "~120K research institutions with ROR ID, country, h-index, "
        "works count, and citation counts"
    )
    filename = "openalex_institutions.json.gz"
    count_label = "institutions"
    auto_download = False  # large; user opts in via dashboard
    required = False
    approx_size_mb = 10.0  # final compact output, NOT the raw snapshot

    def fetch(self, data_dir: Path, progress_cb=None) -> int:
        """Download the institutions snapshot and write the compact index.

        Args:
            data_dir: Directory that receives ``self.filename``. It is also
                handed to ``iter_partitions`` for the per-part downloads.
            progress_cb: Optional callable invoked after every partition as
                ``progress_cb(parts_done, total_parts, message)``. Anything
                it raises is logged at debug level and otherwise ignored.

        Returns:
            The number of institutions written to the output file.

        Raises:
            RuntimeError: If fewer than ``_MIN_INSTITUTIONS`` records were
                collected. Nothing is written in that case, so a previous
                output file stays untouched.

        Errors from the manifest request (``raise_for_status``), from
        manifest validation, and from partition processing propagate to
        the caller unchanged.
        """
        from ...security.safe_requests import (
            safe_get_with_retries as safe_get,
        )

        # 1. Fetch the manifest.
        logger.info(
            "Fetching OpenAlex institutions manifest: "
            f"{_OPENALEX_INSTITUTIONS_MANIFEST}"
        )
        # consume_body=True: small JSON but serial bottleneck — a body
        # transient here aborts the whole 10-min institutions pull.
        manifest_resp = safe_get(
            _OPENALEX_INSTITUTIONS_MANIFEST, timeout=30, consume_body=True
        )
        manifest_resp.raise_for_status()
        manifest = manifest_resp.json()

        # OpenAlex's 2026-06 "standard-format" snapshot renamed the
        # manifest's part list from ``entries`` to ``files`` (see
        # ``openalex.py``). Entry shape (url + meta) is unchanged.
        entries = manifest.get("files", [])

        # Validate every part URL before fetching any part — SSRF
        # defense in depth. If any entry points outside the OpenAlex
        # bucket we refuse the whole fetch rather than partially trust.
        validate_manifest_entries(entries, "Institutions")

        # These totals only feed the log line below, so tolerate a null
        # ``meta`` object or null counters instead of aborting the fetch.
        total_records = sum(
            (e.get("meta") or {}).get("record_count") or 0 for e in entries
        )
        total_bytes = sum(
            (e.get("meta") or {}).get("content_length") or 0
            for e in entries
        )
        logger.info(
            f"OpenAlex institutions snapshot: {len(entries)} parts, "
            f"{total_records:,} records, "
            f"{total_bytes / 1024 / 1024:.0f} MB compressed"
        )

        # 2. Stream-process each part. The ``iter_partitions`` helper
        #    in ``_openalex_common`` owns the tmp-file lifecycle and
        #    first-10 malformed-JSON suppression; we focus on the
        #    compact-format + secondary-index extraction. Compact
        #    format matches the journal sources snapshot; ROR and
        #    name indexes keep the runtime lookup path O(1).
        institutions: dict = {}
        ror_index: dict = {}
        name_index: dict = {}
        start = time.time()

        for idx, total_parts, records in iter_partitions(
            entries,
            data_dir,
            file_prefix="openalex_institutions",
            label="Institutions",
            safe_get=safe_get,
        ):
            for inst in records:
                # IDs arrive as URLs; keep only the last path segment.
                inst_id = (inst.get("id") or "").split("/")[-1]
                if not inst_id:
                    continue

                stats = inst.get("summary_stats") or {}
                # Same for ROR, tolerating a trailing slash.
                ror = (inst.get("ror") or "").rstrip("/").split("/")[-1]
                compact = {
                    "n": inst.get("display_name", ""),
                    "c": inst.get("country_code", ""),
                    "t": inst.get("type", ""),
                    "h": stats.get("h_index"),
                    "if": stats.get("2yr_mean_citedness"),
                    "w": inst.get("works_count"),
                    "cb": inst.get("cited_by_count"),
                    "r": ror or None,
                }
                institutions[inst_id] = compact

                if ror:
                    ror_index[ror] = inst_id

                # A primary display name always claims its key; an
                # alternative name only fills a key nobody holds yet.
                primary = normalize_name(inst.get("display_name", "") or "")
                if primary:
                    name_index[primary] = inst_id
                for alt in inst.get("display_name_alternatives") or []:
                    alt_key = normalize_name(alt or "")
                    if alt_key and alt_key not in name_index:
                        name_index[alt_key] = inst_id

            # Log every 5th partition and always the final one.
            if (idx + 1) % 5 == 0 or idx == total_parts - 1:
                elapsed = time.time() - start
                logger.info(
                    "OpenAlex institutions: processed "
                    f"{idx + 1}/{total_parts} parts "
                    f"({len(institutions):,} records, {elapsed:.0f}s)"
                )
            # Per-partition UI ping so the dashboard bar moves
            # frequently enough to feel live (vs every 5th partition).
            if progress_cb is not None:
                try:
                    progress_cb(
                        idx + 1,
                        total_parts,
                        f"{len(institutions):,} records",
                    )
                except Exception:
                    # Deliberately best-effort: a failing progress hook
                    # must not abort the download.
                    logger.debug(
                        "institutions progress_cb raised; continuing",
                        exc_info=True,
                    )

        if len(institutions) < _MIN_INSTITUTIONS:
            raise RuntimeError(
                "OpenAlex institutions: suspiciously few records "
                f"({len(institutions):,} < {_MIN_INSTITUTIONS:,}); "
                "refusing to overwrite existing data"
            )

        payload = {
            "i": institutions,  # institution ID → compact record
            "r": ror_index,  # ROR ID → institution ID
            "nm": name_index,  # normalized name → institution ID
        }

        # 3. Write to a sibling temp file, then swap it into place.
        #    ``Path.replace`` overwrites an existing destination on every
        #    platform; ``Path.rename`` raises FileExistsError on Windows
        #    when the output already exists (i.e. on every refresh).
        output = data_dir / self.filename
        tmp = data_dir / f"{self.filename}.tmp"
        try:
            with gzip.open(tmp, "wt", encoding="utf-8") as f:
                json.dump(payload, f)
            tmp.replace(output)
        except BaseException:
            # Do not leave a partial temp file behind; then re-raise.
            tmp.unlink(missing_ok=True)
            raise

        elapsed = time.time() - start
        logger.info(
            f"OpenAlex institutions: saved {len(institutions):,} "
            f"institutions in {elapsed:.0f}s"
        )
        return len(institutions)

Coverage for src/local_deep_research/journal_quality/data_sources/institutions.py: 82%

76 statements