Coverage for src/local_deep_research/journal_quality/data_sources/openalex.py

"""OpenAlex sources data source.

Downloads the **bulk snapshot** of ~280K journals + conferences from
``openalex.s3.amazonaws.com`` and writes a compact gzipped JSON file
used by the in-memory tier-2 lookups.

This is the public OpenAlex S3 snapshot — CC0 licensed, no auth, no
rate limits, ~350 MB compressed across ~120 partition files. We stream
each part, extract the few fields we need, and discard the rest. Total
wall-clock is ~30–60 s on a normal connection.

Why bulk snapshot instead of the REST API
-----------------------------------------

The previous implementation paginated ``/api/sources`` with
``per_page=200`` and a 100 ms polite-rate-limiting sleep between
requests. For ~280K records that meant ~1,400 HTTP requests + ~140 s
of pure sleep, plus actual transfer time — wall-clock 5–10 minutes,
and that's the time the user spends staring at the dashboard
"Download Data" button. The S3 dump is the recommended bulk path per
OpenAlex docs and finishes in well under a minute.
"""

24from __future__ import annotations

26import gzip

27import json

28import time

29from pathlib import Path

31from loguru import logger

33from ...utilities.citation_normalizer import normalize_issn

34from .base import DataSource

36from ._openalex_common import (

37 OPENALEX_S3_BASE,

38 iter_partitions,

39 validate_manifest_entries,

40)

# URL of the manifest that lists the partition files of the OpenAlex
# ``sources`` snapshot; ``OpenAlexSource.fetch`` downloads it first.
_OPENALEX_SOURCES_MANIFEST = (
    f"{OPENALEX_S3_BASE}/data/jsonl/sources/manifest.json"
)

class SchemaDriftError(RuntimeError):
    """OpenAlex renamed / removed a required field in the snapshot.

    A row-count floor catches the case where the whole fetch collapses,
    but not the case where every row loads but a key field (``h_index``,
    ``cited_by_count``) is silently None. We refuse to overwrite the
    existing snapshot in that case — better to keep the old data than
    rebuild an all-None DB that would quietly reclassify every journal
    into the "unknown" quality tier.

    Subclasses ``RuntimeError`` so callers that already catch
    ``RuntimeError`` keep handling it.
    """

class OpenAlexSource(DataSource):
    """Data source that compacts the OpenAlex ``sources`` bulk snapshot.

    ``fetch`` streams every partition listed in the snapshot manifest,
    keeps a handful of fields per source, and writes them to
    ``filename`` as gzipped JSON.
    """

    key = "openalex"  # gitleaks:allow
    name = "OpenAlex"
    url = "https://openalex.org"
    dataset_url = (
        "https://docs.openalex.org/download-all-data/openalex-snapshot"
    )
    license = "CC0 1.0"
    license_url = "https://creativecommons.org/publicdomain/zero/1.0/"
    description = (
        "~280K journals and conferences with h-index, impact factor, "
        "and publisher metadata"
    )
    filename = "openalex_sources.json.gz"
    count_label = "OpenAlex sources"
    auto_download = False  # large; user opts in via dashboard
    required = True  # bulk-download fatal-on-failure
    approx_size_mb = 13.0  # final compact output, NOT the raw snapshot

    def fetch(self, data_dir: Path, progress_cb=None) -> int:
        """Download the snapshot and write the compact file into ``data_dir``.

        Args:
            data_dir: Directory that receives ``self.filename``; it is also
                handed to ``iter_partitions`` for its temporary files.
            progress_cb: Optional callable invoked after every partition as
                ``progress_cb(parts_done, total_parts, message)``. Anything
                it raises is logged at debug level and otherwise ignored.

        Returns:
            Number of sources written to the output file.

        Raises:
            SchemaDriftError: At least 10,000 records were parsed but none
                had an ``id``, or a sample of 100 journal records has no
                ``h_index`` or no ``cited_by_count`` at all.
            RuntimeError: Fewer than 10,000 sources were collected.

        Errors from the manifest request (``raise_for_status``) and from
        the partition iterator propagate unchanged. The existing output
        file is only replaced after every check has passed.
        """
        from itertools import islice

        from ...security.safe_requests import (
            safe_get_with_retries as safe_get,
        )

        # 1. Fetch the manifest. Tells us which partition files exist
        #    and how many records to expect — so we can give the user
        #    an accurate progress log instead of just dots.
        logger.info(
            f"Fetching OpenAlex sources manifest: {_OPENALEX_SOURCES_MANIFEST}"
        )
        # consume_body=True: the manifest is small but a serial
        # bottleneck for the whole download. A body-read transient
        # here aborts everything.
        manifest_resp = safe_get(
            _OPENALEX_SOURCES_MANIFEST, timeout=30, consume_body=True
        )
        manifest_resp.raise_for_status()
        manifest = manifest_resp.json()

        # OpenAlex's 2026-06 "standard-format" snapshot moved the data to
        # ``data/jsonl/<entity>/`` and renamed the manifest's part list from
        # ``entries`` to ``files``. Each file entry still has ``url``
        # (s3://openalex/...) + ``meta.record_count`` / ``meta.content_length``.
        # ``or``-defaults: an explicit JSON null is treated like a missing
        # key instead of crashing the size summary below.
        entries = manifest.get("files") or []
        total_records = sum(
            (e.get("meta") or {}).get("record_count", 0) for e in entries
        )
        total_bytes = sum(
            (e.get("meta") or {}).get("content_length", 0) for e in entries
        )
        logger.info(
            f"OpenAlex sources snapshot: {len(entries)} parts, "
            f"{total_records:,} records, "
            f"{total_bytes / 1024 / 1024:.0f} MB compressed"
        )

        # Validate manifest URLs before fetching anything, so a
        # compromised manifest cannot redirect fetches to an arbitrary
        # host. Legitimate OpenAlex manifest entries always use the
        # s3://openalex/ prefix.
        validate_manifest_entries(entries, "OpenAlex sources")

        # 2. Stream-process each part. The ``iter_partitions`` helper
        #    in ``_openalex_common`` owns the tmp-file lifecycle + the
        #    first-10 malformed-JSON suppression; we just consume
        #    records and track our own schema-drift counters.
        type_map = {"journal": "j", "conference": "c"}
        sources: dict[str, dict] = {}
        # Raw parse counters feed the ``id``-rename drift check below —
        # records without an ``id`` are silently skipped at the point of
        # extraction, so we need to know the ratio to distinguish a
        # collapsed fetch from a renamed identifier field.
        parsed_records = 0
        parsed_with_id = 0
        start = time.time()

        for idx, total_parts, records in iter_partitions(
            entries,
            data_dir,
            file_prefix="openalex_sources",
            label="OpenAlex sources",
            safe_get=safe_get,
        ):
            for rec in records:
                parsed_records += 1
                # The source key is the last path segment of the ``id``.
                src_id = (rec.get("id") or "").split("/")[-1]
                if not src_id:
                    continue
                parsed_with_id += 1

                stats = rec.get("summary_stats") or {}
                compact = {
                    "n": rec.get("display_name", ""),
                    # Unknown types are stored verbatim.
                    "t": type_map.get(rec.get("type", ""), rec.get("type", "")),
                    "h": stats.get("h_index"),
                    "if": stats.get("2yr_mean_citedness"),
                    "cb": rec.get("cited_by_count"),
                    "p": rec.get("host_organization_name") or "",
                    "i": normalize_issn(rec.get("issn_l")),
                }
                # Flags are only stored when set, to keep the file small.
                if rec.get("is_in_doaj"):
                    compact["d"] = 1
                if rec.get("is_core"):
                    compact["s"] = 1
                sources[src_id] = compact

            if (idx + 1) % 5 == 0 or idx == total_parts - 1:
                elapsed = time.time() - start
                logger.info(
                    f"OpenAlex sources: processed {idx + 1}/{total_parts} "
                    f"parts ({len(sources):,} records, {elapsed:.0f}s)"
                )
            # Report on EVERY partition, not just every 5th — the UI
            # needs smoother updates than the human-readable log.
            if progress_cb is not None:
                try:
                    progress_cb(
                        idx + 1,
                        total_parts,
                        f"{len(sources):,} records",
                    )
                except Exception:
                    # Best-effort: a broken UI callback must not abort the
                    # download. loguru ignores the stdlib ``exc_info``
                    # kwarg, so the traceback is attached via ``opt``.
                    logger.opt(exception=True).debug(
                        "OpenAlex progress_cb raised; continuing"
                    )

        # ``id``-rename drift runs *before* the row-count floor: a
        # renamed identifier makes every record drop out at parse time
        # (src_id empty → continue), so ``sources`` is empty and the
        # row-count floor would fire first with a generic RuntimeError
        # that hides the actual cause.
        if parsed_records >= 10_000 and parsed_with_id == 0:
            raise SchemaDriftError(
                f"OpenAlex snapshot parsed {parsed_records:,} records but "
                "none carried an 'id' field — the source identifier may "
                "have been renamed (e.g. to 'source_id'). Refusing to "
                "overwrite existing data — please check "
                "https://docs.openalex.org/download-all-data/"
                "snapshot-data-format for schema changes."
            )

        # 3. Write the compact snapshot in the same shape the existing
        #    build pipeline expects (`{"s": {src_id: compact}}`).
        # Sanity check: OpenAlex normally has ~280K sources. If the
        # fetch silently returned a tiny subset (e.g., every partition
        # returned an empty shard), refuse to overwrite existing data.
        _MIN_OPENALEX_SOURCES = 10_000
        if len(sources) < _MIN_OPENALEX_SOURCES:
            raise RuntimeError(
                f"OpenAlex sources: suspiciously few records "
                f"({len(sources):,} < {_MIN_OPENALEX_SOURCES:,}); "
                "refusing to overwrite existing data"
            )

        # Field-level schema drift detection. The row-count floor above
        # catches a collapsed fetch, but not a silent upstream rename.
        # The journal-only sample avoids false-triggering on snapshots
        # that skew to non-journal types (conferences, repositories,
        # etc.) which legitimately lack ``h_index``.
        _SCHEMA_SAMPLE_SIZE = 100
        # ``islice`` stops at the first 100 journals instead of building a
        # list of every journal just to slice it.
        journal_sample = list(
            islice(
                (row for row in sources.values() if row.get("t") == "j"),
                _SCHEMA_SAMPLE_SIZE,
            )
        )
        if len(journal_sample) >= _SCHEMA_SAMPLE_SIZE:
            has_hindex = any(r.get("h") is not None for r in journal_sample)
            has_cited = any(r.get("cb") is not None for r in journal_sample)
            if not has_hindex or not has_cited:
                raise SchemaDriftError(
                    "OpenAlex snapshot appears to have renamed a required "
                    "field: "
                    f"h_index present in journal sample={has_hindex}, "
                    f"cited_by_count present in journal sample={has_cited}. "
                    "Refusing to overwrite existing data — please check "
                    "https://docs.openalex.org/download-all-data/"
                    "snapshot-data-format for schema changes."
                )
        else:
            # The row-count floor above already refuses a collapsed
            # fetch. This branch only fires in unusual cases (truncated
            # test snapshot, snapshot with very few journal-typed
            # sources). Log at info so operators see the drift check
            # was bypassed — debug would be invisible at production
            # log levels.
            logger.info(
                "OpenAlex schema-drift check skipped: "
                f"only {len(journal_sample)} journal source(s) in sample "
                f"(< {_SCHEMA_SAMPLE_SIZE} required)"
            )

        output = data_dir / self.filename
        tmp = data_dir / f"{self.filename}.tmp"
        try:
            with gzip.open(tmp, "wt", encoding="utf-8") as f:
                json.dump({"s": sources}, f)
            # ``replace`` overwrites an existing target on every platform;
            # ``rename`` raises FileExistsError on Windows when the previous
            # snapshot is already there, which is the normal refresh case.
            tmp.replace(output)
        finally:
            # No-op after a successful replace; otherwise removes the
            # partial file so a failed write leaves nothing behind.
            tmp.unlink(missing_ok=True)

        elapsed = time.time() - start
        logger.info(
            f"OpenAlex sources: saved {len(sources):,} sources in {elapsed:.0f}s"
        )
        return len(sources)

Coverage for src/local_deep_research/journal_quality/data_sources/openalex.py: 87%

83 statements