Coverage for src/local_deep_research/journal

1"""Read-only SQLAlchemy accessor for the compiled journal-quality DB.

3INVARIANT: this module's `JournalQualityDB` class **never writes**.

4The runtime engine is opened with SQLite URI flags `mode=ro` and

5`immutable=1`, the file is `chmod 0o444` after every build, and a

6pre-commit hook bans cross-module opens of the file without `mode=ro`.

8The only writer is `build_db()` in this same module, which opens its

9own short-lived writable engine, populates the schema, runs ANALYZE

10+ VACUUM, closes the engine, and chmods the file back to 0o444.

12The DB compiles five gzipped JSON snapshots (downloaded by

13`journal_quality.downloader`) into one queryable file:

15- OpenAlex sources → `sources` table (with predatory + DOAJ flags)

16- Stop Predatory Journals → `predatory_journals/_publishers/_hijacked`

17- DOAJ → cross-referenced into `sources`

18- JabRef abbreviations → `abbreviations` table

19- OpenAlex Institutions → `institutions` table

21Built fresh on every download, no migrations.

22"""

24from __future__ import annotations

26import gzip

27import json

28import os

29import secrets

30import sqlite3

31import sys

32import threading

33import time

34from contextlib import contextmanager

35from pathlib import Path

36from typing import Iterable, Iterator, Optional

38from loguru import logger

39from sqlalchemy import create_engine, func, inspect, or_, select

40from sqlalchemy.engine import Engine

41from sqlalchemy.exc import DatabaseError, OperationalError

42from sqlalchemy.orm import Session, sessionmaker

44from .models import (

45 Abbreviation,

46 Institution,

47 JournalQualityBase,

48 PredatoryHijacked,

49 PredatoryJournal,

50 PredatoryPublisher,

51 Source,

52)

53from ..constants import PREDATORY_WHITELIST_HINDEX

54from ..utilities.citation_normalizer import normalize_issn

55from ..utilities.sql_utils import escape_like

56from .scoring import (

57 derive_quality_score,

58 institution_score_from_h_index,

59 normalize_name,

60)

# File name of the compiled journal-quality SQLite database; joined onto the
# journal data directory by `JournalQualityDB._resolve_db_path`.
DB_FILENAME = "journal_quality.db"
# NOTE(review): presumably the row batch size used by the build step — it is
# not referenced in the visible part of this module; confirm against build_db.
_BATCH_SIZE = 5000

# Bump when the reference DB schema (models.py) changes in a way that
# requires a rebuild even if the upstream data version hasn't changed.
# Stamped as SQLite `PRAGMA user_version` during build_db; checked on
# _ensure_engine. Separate from JOURNAL_DATA_VERSION (downloader.py)
# which tracks upstream source-data freshness.
# v4: dropped Source.has_doaj_seal — DOAJ retired the Seal in April 2025.
JOURNAL_QUALITY_SCHEMA_VERSION = 4

# Quality tier → (low, high) score range, used by the dashboard tier filter.
# The bounds are applied with SQL BETWEEN in `get_journals_page`, so both
# ends are inclusive.
_TIER_RANGES = {
    "elite": (9, 10),
    "strong": (7, 8),
    "moderate": (5, 6),
    "low": (3, 4),
    "predatory": (1, 2),
}

# Columns safe to use in ORDER BY (prevents injection via the dashboard
# `sort` query parameter). `get_journals_page` resolves the chosen name with
# `getattr(Source, sort)`, so every entry must be an attribute of `Source`.
_SORT_COLUMNS = frozenset(
    {
        "name",
        "quality",
        "quartile",
        "h_index",
        "impact_factor",
        "score_source",
        "source_type",
        "publisher",
        "is_predatory",
    }
)

# Max length for user-supplied search strings. Even with LIKE
# wildcards escaped, a 10 KB pattern against 217K rows is slow enough
# to matter for CPU budget under concurrent requests.
# The normalized search text is truncated to this length before escaping.
_MAX_SEARCH_LEN = 100

102

103

104# ---------------------------------------------------------------------------

105# Read-only accessor

106# ---------------------------------------------------------------------------

107

108

109class JournalQualityDB:

110 """Read-only SQLAlchemy 2.0 accessor for `journal_quality.db`.

111

112 All filter hot-path methods return plain dicts (not mapped Source

113 objects) so call sites in `journal_reputation_filter.py` keep the

114 same call shape they had against the dict-based predecessor. The

115 dashboard methods can return either dicts or Source instances —

116 they're called once per page view so the ORM overhead is fine.

117 """

118

119 def __init__(self) -> None:

120 self._engine: Optional[Engine] = None

121 self._SessionLocal: Optional[sessionmaker[Session]] = None

122 # RLock (not Lock): _ensure_engine holds the lock while calling

123 # _build_or_raise → build_db → reset_db → self.reset(), which

124 # re-acquires the same lock. A non-reentrant Lock would deadlock

125 # the very first request on a fresh install.

126 self._lock = threading.RLock()

127 # Whether we've already logged a stale-data-version warning for

128 # this engine lifetime. Prevents log spam — one WARNING per

129 # server start is enough to surface the problem to admins.

130 self._stale_version_warned = False

131

132 # --- engine + session lifecycle ---

133

134 def _resolve_db_path(self) -> Path:

135 from ..config.paths import get_journal_data_directory

136

137 return get_journal_data_directory() / DB_FILENAME

138

139 def _ensure_engine(self) -> None:

140 # Acquire lock BEFORE first read to avoid DCLP publication hazard.

141 # With the GIL this is safe on CPython but explicit locking makes

142 # the happens-before relationship clear and portable.

143 with self._lock:

144 if self._engine is not None:

145 return

146 path = self._resolve_db_path()

147 if not path.exists(): 147 ↛ 148line 147 didn't jump to line 148 because the condition on line 147 was never true

148 self._build_or_raise(path)

149 else:

150 # Validate existing file before wiring up the read-only

151 # engine. Catches two failure modes at open time instead

152 # of letting them propagate to first query:

153 # 1. Schema drift — ORM changed since this file was

154 # built (PRAGMA user_version mismatch) → rebuild.

155 # 2. Corruption — file exists but isn't a valid DB

156 # (truncated build, disk error) → rebuild.

157 if not self._validate_existing_db(path): 157 ↛ 168line 157 didn't jump to line 168 because the condition on line 157 was always true

158 self._build_or_raise(path)

159

160 # mode=ro + immutable=1: SQLite physically refuses writes,

161 # skips locking entirely, and reads via mmap. The OS page

162 # cache holds one shared resident copy of the hot pages.

163 #

164 # Use a creator callback because SQLAlchemy's URL parser

165 # eats the ?mode=ro&immutable=1 query string before it can

166 # reach sqlite3. The creator builds the connection directly

167 # with the SQLite URI flags intact.

168 def _make_ro_conn() -> sqlite3.Connection:

169 return sqlite3.connect(

170 f"file:{path}?mode=ro&immutable=1",

171 uri=True,

172 check_same_thread=False,

173 )

174

175 # StaticPool: with immutable=1 SQLite skips locking and the

176 # OS page cache handles concurrency. A single shared connection

177 # is safe and avoids the default QueuePool's 15-connection

178 # footprint that offers no benefit for immutable reads.

179 from sqlalchemy.pool import StaticPool

180

181 engine = create_engine(

182 "sqlite://",

183 creator=_make_ro_conn,

184 poolclass=StaticPool,

185 echo=False,

186 )

187 session_local = sessionmaker(bind=engine, expire_on_commit=False)

188 # Publish both together so readers never see engine-without-session.

189 self._engine = engine

190 self._SessionLocal = session_local

191 logger.info(f"Opened journal_quality.db (read-only): {path}")

192 # One-shot check: is the DATA version (the one the sources

193 # JSON + build logic produce) behind the bundled latest?

194 # Schema drift is already handled by `_validate_existing_db`

195 # via ``PRAGMA user_version``. A data-version mismatch is a

196 # different concern: the DB schema is fine, but the scoring

197 # logic (e.g. the repository cap) or source snapshots have

198 # been updated since this file was built. The hot path

199 # (filter scoring) would silently serve stale scores if we

200 # didn't surface the mismatch anywhere but the admin

201 # dashboard. Log once, don't auto-rebuild — user consent

202 # via the dashboard "Download Data" button remains the

203 # explicit refresh trigger.

204 self._warn_on_stale_data_version(path.parent)

205

206 def _warn_on_stale_data_version(self, data_dir: Path) -> None:

207 """Log WARNING once if ``version.json`` is behind ``JOURNAL_DATA_VERSION``."""

208 if self._stale_version_warned:

209 return

210 # Lazy import to avoid any downloader → db cycle even though

211 # today's module graph doesn't have one.

212 from .downloader import JOURNAL_DATA_VERSION

213

214 version_file = data_dir / "version.json"

215 if not version_file.exists():

216 return # Brand-new install — the dashboard's banner handles this.

217 try:

218 with open(version_file, encoding="utf-8") as f:

219 info = json.load(f)

220 installed = info.get("version")

221 except (json.JSONDecodeError, OSError):

222 return # Malformed — dashboard's banner surfaces; don't double-log.

223 if installed and installed != JOURNAL_DATA_VERSION:

224 logger.warning(

225 f"journal_quality data version is stale: on-disk={installed!r} "

226 f"bundled-latest={JOURNAL_DATA_VERSION!r}. "

227 f"Scoring is continuing with the older data. Visit "

228 f"/metrics/journals and click 'Download Data' to refresh."

229 )

230 self._stale_version_warned = True

231

232 def _validate_existing_db(self, path: Path) -> bool:

233 """Return True if the existing DB file is usable as-is.

234

235 A version of 0 means the file was built before schema stamping

236 existed and is grandfathered in — we don't force a rebuild just

237 because the stamp is missing. A non-zero version that doesn't

238 match the current schema is a real drift signal and triggers a

239 rebuild. File-open errors also trigger a rebuild.

240 """

241 from ..utilities.resource_utils import safe_close

242

243 conn: Optional[sqlite3.Connection] = None

244 try:

245 conn = sqlite3.connect(f"file:{path}?mode=ro", uri=True)

246 version = conn.execute("PRAGMA user_version").fetchone()[0]

247 if version != 0 and version != JOURNAL_QUALITY_SCHEMA_VERSION:

248 logger.warning(

249 f"journal_quality.db schema_version={version}, "

250 f"expected {JOURNAL_QUALITY_SCHEMA_VERSION} — "

251 f"rebuilding"

252 )

253 # NB: no explicit safe_close here — the finally block

254 # handles closing. Calling it twice produced a spurious

255 # "Cannot operate on a closed database" warning on

256 # every schema-triggered rebuild.

257 self._unlink_unusable_db(path)

258 return False

259 # Cheap sanity check — confirms the file is a valid DB.

260 conn.execute("SELECT 1 FROM sqlite_master LIMIT 1").fetchone()

261 return True

262 except (sqlite3.DatabaseError, OSError):

263 logger.exception(

264 f"journal_quality.db at {path} is unusable; rebuilding"

265 )

266 self._unlink_unusable_db(path)

267 return False

268 finally:

269 if conn is not None:

270 safe_close(conn, "journal_quality validate")

271

272 @staticmethod

273 def _unlink_unusable_db(path: Path) -> None:

274 """Best-effort cleanup of a corrupted / schema-drifted DB file.

275

276 Corruption was already logged by the caller (``_validate_existing_db``).

277 Both operations below are best-effort — on failure we *log and

278 continue* rather than raise, because the build path will rebuild

279 the file regardless. But we don't silence: if chmod / unlink

280 fails (permissions, read-only mount, file held open on Windows)

281 the next build will likely also fail and the user needs the

282 warning to diagnose the real problem.

283 """

284 try:

285 # bearer:disable python_lang_file_permissions

286 os.chmod(path, 0o644)

287 except OSError:

288 logger.warning(

289 f"Could not chmod 0644 on unusable DB {path} before "

290 f"unlink (continuing to unlink attempt)"

291 )

292 try:

293 path.unlink()

294 except OSError:

295 logger.warning(

296 f"Could not unlink unusable DB {path} (will be "

297 f"overwritten on next build)"

298 )

299

300 def _build_or_raise(self, path: Path) -> None:

301 """Lazy-build the DB on first access if it's missing."""

302 from .downloader import ensure_journal_data

303

304 data_dir, available = ensure_journal_data()

305 if not available:

306 raise FileNotFoundError(

307 "Journal data files not available. "

308 "Check your network connection or download manually "

309 "from the dashboard."

310 )

311 logger.info(f"Building {DB_FILENAME} from data files...")

312 build_db(data_dir=data_dir, output_path=path)

313

314 @contextmanager

315 def session(self) -> Iterator[Session]:

316 """Yield a read-only SQLAlchemy session.

317

318 If the underlying file becomes corrupt mid-session (e.g. a

319 rebuild ran and this engine is pointed at a now-unlinked inode),

320 DatabaseError propagates but we drop the cached engine so the

321 next call rebuilds cleanly instead of failing forever.

322 """

323 self._ensure_engine()

324 if self._SessionLocal is None: 324 ↛ 325line 324 didn't jump to line 325 because the condition on line 324 was never true

325 raise RuntimeError("JournalQualityDB engine failed to initialize")

326 from ..utilities.resource_utils import safe_close

327

328 s = self._SessionLocal()

329 try:

330 yield s

331 except (OperationalError, DatabaseError):

332 logger.exception("journal_quality.db error — resetting engine")

333 safe_close(s, "journal_quality session")

334 self.reset()

335 raise

336 else:

337 safe_close(s, "journal_quality session")

338

339 @property

340 def available(self) -> bool:

341 try:

342 self._ensure_engine()

343 return True

344 except FileNotFoundError:

345 return False

346

347 def reset(self) -> None:

348 """Drop the cached engine — call after `build_db` rebuilds the file."""

349 with self._lock:

350 if self._engine is not None: 350 ↛ exitline 350 didn't jump to the function exit

351 self._engine.dispose()

352 self._engine = None

353 self._SessionLocal = None

354 logger.info("Reset journal_quality.db engine")

355

356 # --- filter hot path: return plain dicts ---

357

358 def lookup_openalex(

359 self,

360 *,

361 source_id: Optional[str] = None,

362 issn: Optional[str] = None,

363 name: Optional[str] = None,

364 ) -> Optional[dict]:

365 """Look up a source by OpenAlex ID, ISSN, or name.

366

367 Returns a dict with the same shape the dict-based predecessor

368 produced (`name`, `type`, `h_index`, `impact_factor`,

369 `is_in_doaj`, `publisher`, `issn_l`) so the filter code at

370 `journal_reputation_filter.py` doesn't need to change.

371 """

372 issn = normalize_issn(issn)

373 try:

374 self._ensure_engine()

375 except FileNotFoundError:

376 return None

377 with self.session() as s:

378 row = self._lookup_source_row(s, source_id, issn, name)

379 return _source_to_lookup_dict(row) if row else None

380

    # Alias used by the dashboard / future call sites. Bound to the very
    # same function object as `lookup_openalex`, so both names accept the
    # same keyword-only arguments and return the same dict shape.
    lookup_source = lookup_openalex

383

384 def count_predatory_by_names(self, names: Iterable[str]) -> int:

385 """Count how many of the given journal names are flagged predatory.

386

387 One SQL round-trip using ``WHERE name_lower IN (…) AND is_predatory = TRUE``.

388 Names are normalized (NFKC + lower + strip) so the caller can pass raw

389 display names; matches the normalization used at build time.

390

391 Used by the per-user metrics dashboard to report a global "N predatory

392 journals across all your research" stat without making N round trips

393 to the reference DB. Returns 0 if the reference DB is missing or if

394 ``names`` is empty.

395

396 .. note::

397 Deliberately unchunked. SQLite's ``SQLITE_MAX_VARIABLE_NUMBER``

398 has been 250,000 since SQLite 3.32 (2020); Python 3.11 ships

399 with 3.45.1. A heavy user with 100k distinct container_titles

400 is still well under the limit. Re-confirmed in the PR #3081

401 audit — no chunking needed.

402 """

403 normed = {normalize_name(n) for n in names if n}

404 normed.discard("")

405 if not normed:

406 return 0

407 try:

408 self._ensure_engine()

409 except FileNotFoundError:

410 return 0

411 with self.session() as s:

412 stmt = select(func.count(Source.id)).where(

413 Source.name_lower.in_(normed),

414 Source.is_predatory.is_(True),

415 )

416 result = s.execute(stmt).scalar()

417 return int(result or 0)

418

419 def lookup_sources_batch(self, names: Iterable[str]) -> dict:

420 """Batch-look-up multiple journal names in one query.

421

422 Takes an iterable of raw display names and returns a

423 ``{normalized_name: dashboard_dict}`` map for every name that

424 matched a Source. Names that didn't match are simply absent

425 from the result (caller decides how to handle misses).

426

427 Dashboard hot path: the ``/api/journals/user-research`` endpoint

428 collects up to 200 unique ``container_title`` values from the

429 user's Papers and hands them in here — one SQL round-trip vs.

430 200 per-row lookups.

431

432 Normalization matches ``normalize_name`` (NFKC + lower + strip)

433 so the reference DB's ``name_lower`` column hits directly. No

434 "the " / "proceedings of" fallback tiers — those live in

435 ``_lookup_source_row`` for precision per-call; the batch path

436 is for dashboard display where a miss is acceptable.

437

438 Chunked at 900 params per chunk. Defensive: SQLite's actual

439 limit (``SQLITE_MAX_VARIABLE_NUMBER``) has been 250,000 since

440 3.32 (2020) — we could easily put the whole batch in one IN —

441 but 900 keeps us well under any older embedded-SQLite ceiling

442 a deployment might pin to.

443 """

444 normed = [normalize_name(n) for n in names if n]

445 normed = [n for n in normed if n]

446 if not normed:

447 return {}

448 try:

449 self._ensure_engine()

450 except FileNotFoundError:

451 return {}

452 # De-duplicate while preserving insertion order for stable

453 # iteration in tests.

454 seen: set = set()

455 uniq: list = []

456 for n in normed:

457 if n not in seen:

458 seen.add(n)

459 uniq.append(n)

460

461 out: dict = {}

462 CHUNK = 900

463 with self.session() as s:

464 for i in range(0, len(uniq), CHUNK):

465 batch = uniq[i : i + CHUNK]

466 stmt = select(Source).where(Source.name_lower.in_(batch))

467 for row in s.scalars(stmt):

468 out[row.name_lower] = _source_to_dashboard_dict(row)

469 return out

470

471 def lookup_doaj(self, *, issn: Optional[str] = None) -> Optional[dict]:

472 issn = normalize_issn(issn)

473 if not issn:

474 return None

475 try:

476 self._ensure_engine()

477 except FileNotFoundError:

478 return None

479 with self.session() as s:

480 stmt = (

481 select(Source)

482 .where(Source.issn == issn, Source.is_in_doaj.is_(True))

483 .limit(1)

484 )

485 row = s.scalars(stmt).first()

486 if row is None:

487 return None

488 return {

489 "name": row.name,

490 "publisher": row.publisher,

491 }

492

493 def is_in_doaj(self, issn: Optional[str]) -> bool:

494 return self.lookup_doaj(issn=issn) is not None

495

496 def is_predatory(

497 self,

498 *,

499 journal_name: Optional[str] = None,

500 publisher_name: Optional[str] = None,

501 ) -> tuple[bool, Optional[str]]:

502 """Check if a journal/publisher is on the predatory list.

503

504 Looks up the dedicated predatory tables (NOT just the

505 `is_predatory` flag on `Source`), so checks work for arbitrary

506 input names that aren't in OpenAlex.

507 """

508 try:

509 self._ensure_engine()

510 except FileNotFoundError:

511 return False, None

512 with self.session() as s:

513 if journal_name: 513 ↛ 520line 513 didn't jump to line 520 because the condition on line 513 was always true

514 norm = normalize_name(journal_name)

515 if s.get(PredatoryJournal, norm) is not None: 515 ↛ 516line 515 didn't jump to line 516 because the condition on line 515 was never true

516 return True, "stop-predatory-journals"

517 if s.get(PredatoryHijacked, norm) is not None: 517 ↛ 518line 517 didn't jump to line 518 because the condition on line 517 was never true

518 return True, "stop-predatory-hijacked"

519

520 if publisher_name: 520 ↛ 521line 520 didn't jump to line 521 because the condition on line 520 was never true

521 pub_norm = normalize_name(publisher_name)

522 if s.get(PredatoryPublisher, pub_norm) is not None:

523 return True, "stop-predatory-publishers"

524 # Substring scan over long entries (~1162 rows)

525 stmt = select(PredatoryPublisher.name_lower).where(

526 PredatoryPublisher.is_long.is_(True)

527 )

528 for (entry,) in s.execute(stmt).all():

529 if pub_norm in entry or entry in pub_norm:

530 return True, "stop-predatory-publishers"

531

532 return False, None

533

534 def is_whitelisted(

535 self,

536 *,

537 issn: Optional[str] = None,

538 name: Optional[str] = None,

539 ) -> bool:

540 if self.is_in_doaj(issn):

541 return True

542 oa = self.lookup_openalex(issn=issn, name=name)

543 if oa and (oa.get("h_index") or 0) > PREDATORY_WHITELIST_HINDEX:

544 return True

545 return False

546

547 def lookup_institution(

548 self,

549 *,

550 ror_id: Optional[str] = None,

551 openalex_id: Optional[str] = None,

552 name: Optional[str] = None,

553 ) -> Optional[dict]:

554 """Look up an institution.

555

556 Order: openalex_id → ror → name. Returns a dict with full-name

557 keys (``name``, ``country``, ``type``, ``h_index``,

558 ``impact_factor``, ``works_count``, ``cited_by_count``, ``ror_id``)

559 or ``None`` if no match. The on-disk snapshot uses one-character

560 keys (``n``, ``c``, etc.) for space efficiency; the accessor

561 returns full names instead for legibility and schema robustness.

562 """

563 try:

564 self._ensure_engine()

565 except FileNotFoundError:

566 return None

567 with self.session() as s:

568 row: Optional[Institution] = None

569

570 if openalex_id:

571 sid = openalex_id.split("/")[-1]

572 row = s.get(Institution, sid)

573

574 if row is None and ror_id:

575 ror = ror_id.rstrip("/").split("/")[-1]

576 stmt = (

577 select(Institution)

578 .where(Institution.ror_id == ror)

579 .limit(1)

580 )

581 row = s.scalars(stmt).first()

582

583 if row is None and name:

584 norm = normalize_name(name)

585 stmt = (

586 select(Institution)

587 .where(Institution.name_lower == norm)

588 .limit(1)

589 )

590 row = s.scalars(stmt).first()

591

592 return _institution_to_dict(row) if row else None

593

594 def score_from_affiliations(self, affiliations: list) -> Optional[int]:

595 """Derive a score from author affiliations in ONE SQL query."""

596 if not affiliations:

597 return None

598

599 openalex_ids: list[str] = []

600 ror_ids: list[str] = []

601 names: list[str] = []

602

603 for aff in affiliations:

604 if isinstance(aff, str):

605 names.append(normalize_name(aff))

606 elif isinstance(aff, dict):

607 if oid := (aff.get("openalex_id") or aff.get("id")):

608 openalex_ids.append(oid.split("/")[-1])

609 if rid := aff.get("ror"):

610 ror_ids.append(rid.rstrip("/").split("/")[-1])

611 if nm := aff.get("name"):

612 names.append(normalize_name(nm))

613

614 if not (openalex_ids or ror_ids or names):

615 return None

616

617 try:

618 self._ensure_engine()

619 except FileNotFoundError:

620 return None

621

622 clauses = []

623 if openalex_ids:

624 clauses.append(Institution.openalex_id.in_(openalex_ids))

625 if ror_ids:

626 clauses.append(Institution.ror_id.in_(ror_ids))

627 if names:

628 clauses.append(Institution.name_lower.in_(names))

629

630 with self.session() as s:

631 stmt = select(func.max(Institution.h_index)).where(

632 or_(*clauses), Institution.h_index.is_not(None)

633 )

634 best_h = s.scalar(stmt)

635

636 # Single source of truth for institution scoring lives in

637 # scoring.py — delegate so the build phase, the runtime filter,

638 # and this affiliation-salvage path can never disagree.

639 return institution_score_from_h_index(best_h)

640

    # Static passthrough so the filter can call dm.derive_quality_score(...)
    # without importing from .scoring directly. Single home for the
    # scoring rules in scoring.py.
    # The right-hand name resolves to the module-level import from
    # `.scoring`; `staticmethod` keeps `self` from being passed when the
    # function is called on an instance.
    derive_quality_score = staticmethod(derive_quality_score)

645

646 def expand_abbreviation(self, name: str) -> Optional[str]:

647 if not name:

648 return None

649 try:

650 self._ensure_engine()

651 except FileNotFoundError:

652 return None

653 normalized = normalize_name(name)

654 with self.session() as s:

655 row = s.get(Abbreviation, normalized)

656 if row is not None:

657 return row.full_name

658 no_dots = normalized.replace(".", "").strip()

659 if no_dots != normalized:

660 row = s.get(Abbreviation, no_dots)

661 if row is not None:

662 return row.full_name

663 return None

664

665 # --- internal source lookup with name fallbacks ---

666

667 def _lookup_source_row(

668 self,

669 s: Session,

670 source_id: Optional[str],

671 issn: Optional[str],

672 name: Optional[str],

673 ) -> Optional[Source]:

674 if source_id: 674 ↛ 675line 674 didn't jump to line 675 because the condition on line 674 was never true

675 sid = source_id.split("/")[-1] if "/" in source_id else source_id

676 stmt = (

677 select(Source).where(Source.openalex_source_id == sid).limit(1)

678 )

679 row = s.scalars(stmt).first()

680 if row is not None:

681 return row

682

683 if issn: 683 ↛ 684line 683 didn't jump to line 684 because the condition on line 683 was never true

684 stmt = select(Source).where(Source.issn == issn).limit(1)

685 row = s.scalars(stmt).first()

686 if row is not None:

687 return row

688

689 if name: 689 ↛ 739line 689 didn't jump to line 739 because the condition on line 689 was always true

690 norm = normalize_name(name)

691 row = self._fetch_by_name_lower(s, norm)

692 if row is not None: 692 ↛ 693line 692 didn't jump to line 693 because the condition on line 692 was never true

693 return row

694 # Try with/without "the " prefix (~5K journals have it)

695 if norm.startswith("the "): 695 ↛ 696line 695 didn't jump to line 696 because the condition on line 695 was never true

696 row = self._fetch_by_name_lower(s, norm[4:])

697 else:

698 row = self._fetch_by_name_lower(s, "the " + norm)

699 if row is not None: 699 ↛ 700line 699 didn't jump to line 700 because the condition on line 699 was never true

700 return row

701 # Strip "proceedings of (the) (conference on) " prefix

702 stripped = norm

703 for prefix in (

704 "proceedings of the conference on ",

705 "proceedings of the ",

706 "proceedings of ",

707 ):

708 if stripped.startswith(prefix): 708 ↛ 709line 708 didn't jump to line 709 because the condition on line 708 was never true

709 stripped = stripped[len(prefix) :]

710 break

711 if stripped != norm: 711 ↛ 712line 711 didn't jump to line 712 because the condition on line 711 was never true

712 row = self._fetch_by_name_lower(s, stripped)

713 if row is not None:

714 return row

715

716 # MEDLINE-style "Title : long subtitle" — try the segment

717 # before the colon. Catches PubMed names like

718 # "Molecular therapy : the journal of the American Society..."

719 # → "Molecular therapy"

720 if " : " in norm: 720 ↛ 721line 720 didn't jump to line 721 because the condition on line 720 was never true

721 head = norm.split(" : ", 1)[0].strip()

722 if head and head != norm:

723 row = self._fetch_by_name_lower(s, head)

724 if row is not None:

725 return row

726

727 # MEDLINE-style "Title. Section name" — try the segment

728 # before the first period. Catches PubMed names like

729 # "Molecular therapy. Methods and clinical development"

730 # but only when the head is meaningfully shorter (we don't

731 # want to match "Nat" from "Nat. Commun.").

732 if "." in norm: 732 ↛ 733line 732 didn't jump to line 733 because the condition on line 732 was never true

733 head = norm.split(".", 1)[0].strip()

734 if head and len(head) >= 6 and head != norm:

735 row = self._fetch_by_name_lower(s, head)

736 if row is not None:

737 return row

738

739 return None

740

741 @staticmethod

742 def _fetch_by_name_lower(s: Session, name_lower: str) -> Optional[Source]:

743 stmt = select(Source).where(Source.name_lower == name_lower).limit(1)

744 return s.scalars(stmt).first()

745

746 # --- dashboard queries ---

747

748 def get_summary(self) -> dict:

749 if not self.available:

750 return {

751 "total": 0,

752 "avg_quality": 0,

753 "avg_h_index": None,

754 "predatory_count": 0,

755 "doaj_count": 0,

756 "llm_count": 0,

757 }

758

759 with self.session() as s:

760 row = s.execute(

761 select(

762 func.count().label("total"),

763 func.round(func.avg(Source.quality), 1).label(

764 "avg_quality"

765 ),

766 func.round(func.avg(Source.h_index)).label("avg_h_index"),

767 func.sum(func.iif(Source.is_predatory, 1, 0)).label(

768 "predatory_count"

769 ),

770 func.sum(func.iif(Source.is_in_doaj, 1, 0)).label(

771 "doaj_count"

772 ),

773 func.sum(

774 func.iif(Source.score_source == "llm", 1, 0)

775 ).label("llm_count"),

776 )

777 ).first()

778 return dict(row._mapping) if row else {}

779

780 def get_quality_distribution(self) -> dict[str, int]:

781 if not self.available:

782 return {}

783 with self.session() as s:

784 rows = s.execute(

785 select(Source.quality, func.count().label("cnt"))

786 .where(Source.quality.is_not(None))

787 .group_by(Source.quality)

788 .order_by(Source.quality)

789 ).all()

790 return {str(q): c for q, c in rows}

791

792 def get_source_distribution(self) -> dict[str, int]:

793 if not self.available:

794 return {}

795 with self.session() as s:

796 rows = s.execute(

797 select(

798 func.coalesce(Source.score_source, "unknown").label("src"),

799 func.count().label("cnt"),

800 ).group_by(Source.score_source)

801 ).all()

802 return {row.src: row.cnt for row in rows}

803

804 def get_journals_page(

805 self,

806 *,

807 page: int = 1,

808 per_page: int = 50,

809 search: str = "",

810 tier: str = "",

811 score_source: str = "",

812 sort: str = "quality",

813 order: str = "desc",

814 ) -> tuple[list[dict], int]:

815 if not self.available: 815 ↛ 816line 815 didn't jump to line 816 because the condition on line 815 was never true

816 return [], 0

817

818 if sort not in _SORT_COLUMNS: 818 ↛ 819line 818 didn't jump to line 819 because the condition on line 818 was never true

819 sort = "quality"

820 if order not in ("asc", "desc"): 820 ↛ 821line 820 didn't jump to line 821 because the condition on line 820 was never true

821 order = "desc"

822

823 wheres: list = []

824 if search: 824 ↛ 827line 824 didn't jump to line 827 because the condition on line 824 was always true

825 needle = escape_like(normalize_name(search)[:_MAX_SEARCH_LEN])

826 wheres.append(Source.name_lower.like(f"%{needle}%", escape="\\"))

827 if tier and tier in _TIER_RANGES: 827 ↛ 828line 827 didn't jump to line 828 because the condition on line 827 was never true

828 lo, hi = _TIER_RANGES[tier]

829 wheres.append(Source.quality.between(lo, hi))

830 if score_source: 830 ↛ 831line 830 didn't jump to line 831 because the condition on line 830 was never true

831 wheres.append(Source.score_source == score_source)

832

833 sort_col = getattr(Source, sort)

834 order_clause = (

835 sort_col.desc().nulls_last()

836 if order == "desc"

837 else sort_col.asc().nulls_last()

838 )

839

840 offset = (max(1, page) - 1) * per_page

841

842 with self.session() as s:

843 total = (

844 s.scalar(

845 select(func.count()).select_from(Source).where(*wheres)

846 )

847 or 0

848 )

849 rows = s.scalars(

850 select(Source)

851 .where(*wheres)

852 .order_by(order_clause)

853 .limit(per_page)

854 .offset(offset)

855 ).all()

856

857 return [_source_to_dashboard_dict(r) for r in rows], total

858

def get_institutions_page(
    self,
    *,
    page: int = 1,
    per_page: int = 50,
    search: str = "",
    sort: str = "h_index",
    order: str = "desc",
) -> tuple[list[dict], int]:
    """Return one page of institutions and the total number of matches.

    Args:
        page: 1-based page number; values below 1 are treated as 1.
        per_page: Page size, passed straight through as the SQL LIMIT.
        search: Optional substring filter. It is normalized, truncated to
            ``_MAX_SEARCH_LEN`` and LIKE-escaped before being matched
            against ``Institution.name_lower``.
        sort: ``"h_index"`` sorts by h-index; any other value sorts by name.
        order: ``"asc"`` for ascending; every other value means descending.

    Returns:
        ``(rows, total)`` where ``rows`` are dashboard dicts for the
        requested page and ``total`` is the unpaged match count.
        ``([], 0)`` when the DB is not available.
    """
    if not self.available:
        return [], 0

    # Allowlist the direction: only an explicit "asc" sorts ascending,
    # everything else (including unexpected input) falls back to "desc",
    # so a caller-supplied value never reaches the SQL text.
    descending = order != "asc"

    filters = []
    if search:
        needle = escape_like(normalize_name(search)[:_MAX_SEARCH_LEN])
        pattern = f"%{needle}%"
        filters.append(Institution.name_lower.like(pattern, escape="\\"))

    if sort == "h_index":
        column = Institution.h_index
    else:
        column = Institution.name
    direction = column.desc() if descending else column.asc()
    ordering = direction.nulls_last()

    skip = (max(1, page) - 1) * per_page

    count_stmt = select(func.count()).select_from(Institution).where(*filters)
    page_stmt = (
        select(Institution)
        .where(*filters)
        .order_by(ordering)
        .limit(per_page)
        .offset(skip)
    )

    with self.session() as s:
        total = s.scalar(count_stmt) or 0
        found = s.scalars(page_stmt).all()
        return [_institution_to_dashboard_dict(inst) for inst in found], total

912

913

914# ---------------------------------------------------------------------------

915# Dict adapters — keep filter/dashboard call sites unchanged

916# ---------------------------------------------------------------------------

917

918

919def _source_to_lookup_dict(row: Source) -> dict:

920 """Convert a Source row to the dict shape `lookup_openalex` produces.

921

922 Includes `openalex_source_id` so dashboard / test code can chain

923 a follow-up `lookup_source(source_id=...)` call. Also exposes

924 ``quartile`` so the filter can store it on the per-user Journal row

925 and feed it into score derivation.

926 """

927 return {

928 "name": row.name,

929 "type": row.source_type,

930 "h_index": row.h_index,

931 "impact_factor": row.impact_factor,

932 "is_in_doaj": row.is_in_doaj,

933 "publisher": row.publisher,

934 "issn_l": row.issn,

935 "openalex_source_id": row.openalex_source_id,

936 "quartile": row.quartile,

937 }

938

939

940def _source_to_dashboard_dict(row: Source) -> dict:

941 return {

942 "name": row.name,

943 "quality": row.quality,

944 "quartile": row.quartile,

945 "cited_by_count": row.cited_by_count,

946 "h_index": row.h_index,

947 "impact_factor": (

948 round(row.impact_factor, 2) if row.impact_factor else None

949 ),

950 "is_in_doaj": bool(row.is_in_doaj),

951 "is_predatory": bool(row.is_predatory),

952 "predatory_source": row.predatory_source,

953 "score_source": row.score_source,

954 "source_type": row.source_type,

955 "publisher": row.publisher,

956 "issn": row.issn,

957 "openalex_source_id": row.openalex_source_id,

958 }

959

960

961def _institution_to_dict(row: Institution) -> dict:

962 """Public accessor shape for `lookup_institution`.

963

964 The on-disk JSON snapshot uses one-character keys (``n``, ``c``,

965 ``t``, …) purely for space efficiency — 200k institutions × seven

966 long field names adds real bytes. Callers of the accessor don't

967 care about on-disk layout, so here we return the full names to

968 keep the public API legible and robust to future schema tweaks.

969 """

970 return {

971 "name": row.name,

972 "country": row.country,

973 "type": row.type,

974 "h_index": row.h_index,

975 "impact_factor": row.impact_factor,

976 "works_count": row.works_count,

977 "cited_by_count": row.cited_by_count,

978 "ror_id": row.ror_id,

979 }

980

981

982def _institution_to_dashboard_dict(row: Institution) -> dict:

983 return {

984 "openalex_id": row.openalex_id,

985 "name": row.name,

986 "ror_id": row.ror_id,

987 "country": row.country,

988 "type": row.type,

989 "h_index": row.h_index,

990 "impact_factor": row.impact_factor,

991 "works_count": row.works_count,

992 "cited_by_count": row.cited_by_count,

993 }

994

995

996# ---------------------------------------------------------------------------

997# Module singleton

998# ---------------------------------------------------------------------------

999

1000

# Process-wide accessor instance, created on first use by `get_db()`.
# `_db_lock` serializes that first creation (and `reset_db()`).
_db: Optional[JournalQualityDB] = None
_db_lock = threading.Lock()


def get_db() -> JournalQualityDB:
    """Return the shared `JournalQualityDB`, creating it on first call."""
    global _db
    existing = _db
    if existing is not None:
        # Fast path: already initialised, no need to take the lock.
        return existing
    with _db_lock:
        # Re-check under the lock: another thread may have created the
        # instance between the unlocked check above and acquiring it.
        if _db is None:
            _db = JournalQualityDB()
        return _db


# Old names kept importable for metrics_routes.py and a couple of tests.
get_journal_reference_db = get_db
JournalReferenceDB = JournalQualityDB

1018

1019

def reset_db() -> None:
    """Reset the cached singleton's engine after a `build_db` rebuild.

    Does nothing when no singleton has been created yet. The reset runs
    while holding `_db_lock`, the same lock `get_db()` takes on its
    lazy-initialisation path, so the two cannot interleave there.
    """
    with _db_lock:
        current = _db
        if current is None:
            return
        current.reset()

1035

1036

1037# ---------------------------------------------------------------------------

1038# Build phase — the ONLY writer

1039# ---------------------------------------------------------------------------

1040

1041

def build_db(
    data_dir: Optional[Path] = None,
    output_path: Optional[Path] = None,
) -> None:
    """Compile `journal_quality.db` from the bundled JSON sources.

    Opens a SHORT-LIVED writable engine on a unique temp path, creates
    the schema, populates every table, runs ANALYZE + VACUUM, stamps the
    schema version, closes the engine, atomically moves the temp file
    over ``output_path`` and finally marks the file read-only
    (`chmod 0o444`, plus the Windows read-only attribute on win32).
    The cached singleton is reset at the end so readers reopen the new
    file.

    Args:
        data_dir: Directory holding the source snapshots. Defaults to
            `get_journal_data_directory()`.
        output_path: Destination DB file. Defaults to
            ``data_dir / DB_FILENAME``.

    Raises:
        Exception: Any error raised while building is re-raised after the
            engine has been disposed and the partial temp file removed.
    """
    start = time.time()

    if data_dir is None:
        from ..config.paths import get_journal_data_directory

        data_dir = get_journal_data_directory()
    if output_path is None:
        output_path = data_dir / DB_FILENAME

    logger.info(
        "Building journal quality reference DB (one-time, "
        "~30s, decompresses ~25 MB of bundled data)…"
    )

    # Sweep stale temp files from prior crashed builds so they don't
    # accumulate. Any .tmp-* older than 1h is assumed dead.
    _sweep_stale_tmp_files(output_path.parent, output_path.name)

    # Build into a unique temp path, then os.replace() atomically at
    # the end. A random suffix (not a fixed .tmp) lets concurrent
    # builders write to separate files instead of racing on the same
    # path — os.replace picks a winner atomically and neither corrupts
    # the live file.
    tmp_path = output_path.with_name(
        f"{output_path.name}.tmp-{os.getpid()}-{secrets.token_hex(4)}"
    )

    write_url = f"sqlite:///{tmp_path}"
    engine = create_engine(write_url, connect_args={"check_same_thread": False})

    try:
        # Pragmas for fast bulk insert. `journal_mode = OFF` plus
        # `synchronous = OFF` is deliberately unsafe for general use but
        # correct here because durability is guaranteed by the temp-file
        # + os.replace() pattern around this block: we write to a unique
        # `.tmp-PID-RAND` path, and on any crash mid-build the incomplete
        # temp file is orphaned (and swept by `_sweep_stale_tmp_files()`
        # on the next build). The live file is only ever moved into place
        # by the atomic `os.replace()` at the bottom of this function —
        # it never sees a partial write. Do NOT copy this pragma set
        # elsewhere without the same atomic rename discipline.
        with engine.connect() as conn:
            conn.exec_driver_sql("PRAGMA journal_mode = OFF")
            conn.exec_driver_sql("PRAGMA synchronous = OFF")
            conn.exec_driver_sql("PRAGMA cache_size = -64000")
            conn.exec_driver_sql("PRAGMA page_size = 4096")

        JournalQualityBase.metadata.create_all(engine)

        SessionWrite = sessionmaker(bind=engine)
        with SessionWrite() as session:
            sources = _load_openalex(data_dir)
            doaj_data = _load_doaj(data_dir)
            pred_data = _load_predatory(data_dir)
            institutions = _load_institutions(data_dir)
            abbreviations = _load_abbreviations(data_dir)

            _populate_predatory(session, pred_data)
            _populate_sources(session, sources, doaj_data, pred_data)
            _populate_institutions(session, institutions)
            _populate_abbreviations(session, abbreviations)
            session.commit()

        with engine.connect() as conn:
            conn.exec_driver_sql("ANALYZE")
            conn.exec_driver_sql("VACUUM")
            # Stamp schema version so _ensure_engine can detect drift
            # without depending on the external version.json.
            conn.exec_driver_sql(
                f"PRAGMA user_version = {JOURNAL_QUALITY_SCHEMA_VERSION}"
            )
    except Exception:
        engine.dispose()
        if tmp_path.exists():
            try:
                # bearer:disable python_lang_file_permissions
                os.chmod(tmp_path, 0o644)
                tmp_path.unlink()
            except OSError:
                logger.exception(f"Failed to clean up tmp DB at {tmp_path}")
        raise

    engine.dispose()

    # Atomically swap tmp into place. os.replace is atomic on POSIX and
    # overwrites an existing output_path if present.
    if output_path.exists():
        # Prior file is chmod 0444 from the previous build — relax it
        # so os.replace can overwrite. Best-effort: if chmod fails
        # (e.g. read-only mount), os.replace will raise and surface
        # the real problem. Log so the cause is visible.
        try:
            # bearer:disable python_lang_file_permissions
            os.chmod(output_path, 0o644)
        except OSError:
            logger.warning(
                f"Could not chmod 0644 on existing {output_path} before "
                f"os.replace; if the replace fails this is likely why"
            )
    os.replace(tmp_path, output_path)

    # OS-level read-only flag — third layer of write protection.
    # POSIX chmod is a no-op on Windows, so we also set the Windows
    # read-only file attribute via SetFileAttributesW. The pre-commit
    # hook check-journal-quality-readonly.py remains the primary
    # defense against accidental writable opens.
    # bearer:disable python_lang_file_permissions
    os.chmod(output_path, 0o444)
    if sys.platform == "win32":
        try:
            import ctypes

            # FILE_ATTRIBUTE_READONLY = 0x1
            ok = ctypes.windll.kernel32.SetFileAttributesW(
                str(output_path), 0x1
            )
            if not ok:
                logger.warning(
                    f"SetFileAttributesW failed on {output_path.name}; "
                    "readonly pre-commit hook is the sole defense."
                )
        except Exception:
            logger.warning(
                f"Could not set Windows readonly attribute on "
                f"{output_path.name}"
            )

    elapsed = time.time() - start
    size_mb = output_path.stat().st_size / (1024 * 1024)

    # `with sqlite3.connect(...)` only scopes a transaction; it does NOT
    # close the connection. Close explicitly so no file handle on the
    # freshly built DB outlives this function.
    count_conn = sqlite3.connect(
        f"file:{output_path}?mode=ro&immutable=1", uri=True
    )
    try:
        source_count = count_conn.execute(
            "SELECT COUNT(*) FROM sources"
        ).fetchone()[0]
    finally:
        count_conn.close()

    logger.info(
        f"Journal quality DB ready: {source_count} sources, "
        f"{size_mb:.1f} MB in {elapsed:.1f}s ({output_path.name}, chmod 0o444)"
    )

    reset_db()

1193

1194

1195def _sweep_stale_tmp_files(directory: Path, base_name: str) -> None:

1196 """Remove journal_quality.db.tmp-* files older than 1h.

1197

1198 Per-file OSError (vanished between glob+stat, no permission, etc.)

1199 is logged at debug — the sweep is best-effort and shouldn't stop

1200 the build, but silent-pass on filesystem errors hides the cause of

1201 accumulating stale tmp files that would otherwise eat disk over

1202 time.

1203 """

1204 if not directory.exists():

1205 return

1206 cutoff = time.time() - 3600

1207 for tmp in directory.glob(f"{base_name}.tmp-*"):

1208 try:

1209 if tmp.stat().st_mtime < cutoff:

1210 tmp.unlink()

1211 logger.info(f"Swept stale temp build file: {tmp.name}")

1212 except OSError:

1213 logger.debug(f"Could not sweep stale tmp file {tmp.name}")

1214

1215

1216# ---------------------------------------------------------------------------

1217# Source-data loaders (used by build_db only)

1218# ---------------------------------------------------------------------------

1219

1220

1221def _load_openalex(data_dir: Path) -> dict:

1222 path = data_dir / "openalex_sources.json.gz"

1223 if not path.exists():

1224 raise FileNotFoundError(f"OpenAlex source file not found: {path}")

1225 with gzip.open(path, "rt", encoding="utf-8") as f:

1226 data = json.load(f)

1227 sources = data.get("s", data.get("sources", {}))

1228 logger.info(f"Loaded {len(sources)} OpenAlex sources")

1229 return dict(sources)

1230

1231

def _load_doaj(data_dir: Path) -> dict:
    """Load the DOAJ snapshot from *data_dir*.

    Reads ``doaj_journals.json`` and returns a copy of the mapping under
    its ``"journals"`` key. A missing file is not an error: a warning is
    logged and an empty dict is returned.
    """
    snapshot = data_dir / "doaj_journals.json"
    if not snapshot.exists():
        logger.warning(f"{snapshot} not found — DOAJ cross-ref will be skipped")
        return {}

    with open(snapshot, encoding="utf-8") as fh:
        payload = json.load(fh)

    journals = payload.get("journals", {})
    logger.info(f"Loaded {len(journals)} DOAJ entries")
    return dict(journals)

1242

1243

def _load_predatory(data_dir: Path) -> dict:
    """Load the predatory-journal lists from ``predatory.json``.

    Returns:
        ``{"journals": set, "publishers": set, "hijacked": set,
        "long_pubs": list}`` of normalized names. ``long_pubs`` holds the
        normalized publisher names whose stripped raw name is at least
        10 characters long. All four are empty when the file is missing
        (a warning is logged).
    """
    path = data_dir / "predatory.json"
    if not path.exists():
        logger.warning(f"{path} not found — predatory check will be skipped")
        return {
            "journals": set(),
            "publishers": set(),
            "hijacked": set(),
            "long_pubs": [],
        }

    with open(path, encoding="utf-8") as f:
        data = json.load(f)

    def _raw_names(section: str, key: str) -> list:
        """Return the non-blank raw names of one section, in file order.

        Uses `x or ...` rather than `.get(..., default)` so an explicit
        JSON ``null`` (section or name) is skipped instead of raising.
        """
        names = []
        for entry in data.get(section) or []:
            raw = entry.get(key) or ""
            if raw.strip():
                names.append(raw)
        return names

    publisher_raw = _raw_names("publishers", "name")

    journal_names = {normalize_name(n) for n in _raw_names("journals", "name")}
    publisher_names = {normalize_name(n) for n in publisher_raw}
    hijacked_names = {
        normalize_name(n) for n in _raw_names("hijacked", "hijacked_name")
    }
    long_pubs = [
        normalize_name(n) for n in publisher_raw if len(n.strip()) >= 10
    ]
    logger.info(
        f"Loaded predatory: {len(journal_names)} journals, "
        f"{len(publisher_names)} publishers, "
        f"{len(hijacked_names)} hijacked"
    )
    return {
        "journals": journal_names,
        "publishers": publisher_names,
        "hijacked": hijacked_names,
        "long_pubs": long_pubs,
    }

1290

1291

def _load_institutions(data_dir: Path) -> dict:
    """Load the OpenAlex institutions snapshot from *data_dir*.

    Reads ``openalex_institutions.json.gz`` and returns a copy of the
    mapping under its ``"i"`` key. A missing file is not an error: a
    warning is logged and an empty dict is returned.
    """
    snapshot = data_dir / "openalex_institutions.json.gz"
    if not snapshot.exists():
        logger.warning(f"{snapshot} not found — institution tier will be empty")
        return {}

    with gzip.open(snapshot, "rt", encoding="utf-8") as fh:
        payload = json.load(fh)

    institutions = payload.get("i", {})
    logger.info(f"Loaded {len(institutions)} institutions")
    return dict(institutions)

1302

1303

def _load_abbreviations(data_dir: Path) -> dict:
    """Load the JabRef abbreviation snapshot from *data_dir*.

    Reads ``jabref_abbreviations.json.gz`` and returns a copy of the
    mapping under its ``"abbrev_to_full"`` key. A missing file is not an
    error: a warning is logged and an empty dict is returned.
    """
    snapshot = data_dir / "jabref_abbreviations.json.gz"
    if not snapshot.exists():
        logger.warning(
            f"{snapshot} not found — abbreviation expansion will be empty"
        )
        return {}

    with gzip.open(snapshot, "rt", encoding="utf-8") as fh:
        payload = json.load(fh)

    mappings = payload.get("abbrev_to_full", {})
    logger.info(f"Loaded {len(mappings)} abbreviation mappings")
    return dict(mappings)

1316

1317

1318# ---------------------------------------------------------------------------

1319# Table populators

1320# ---------------------------------------------------------------------------

1321

1322

def _populate_predatory(session: Session, pred: dict) -> None:
    """Insert the three predatory name tables from the loaded lists.

    Reads the ``journals``, ``hijacked`` and ``publishers`` name
    collections from *pred* (empty names are skipped) and bulk-inserts
    them, in that order. Publisher rows are flagged ``is_long`` when the
    name also appears in ``pred["long_pubs"]``. A table with no rows is
    not touched.
    """
    long_publishers = set(pred.get("long_pubs", []))

    def _name_rows(key: str) -> list:
        """Build ``{"name_lower": ...}`` rows for one name collection."""
        return [{"name_lower": name} for name in pred.get(key, set()) if name]

    journal_rows = _name_rows("journals")
    if journal_rows:
        session.bulk_insert_mappings(inspect(PredatoryJournal), journal_rows)

    hijacked_rows = _name_rows("hijacked")
    if hijacked_rows:
        session.bulk_insert_mappings(inspect(PredatoryHijacked), hijacked_rows)

    publisher_rows = []
    for name in pred.get("publishers", set()):
        if not name:
            continue
        publisher_rows.append(
            {"name_lower": name, "is_long": name in long_publishers}
        )
    if publisher_rows:
        session.bulk_insert_mappings(
            inspect(PredatoryPublisher), publisher_rows
        )

    logger.info(
        f"Inserted predatory tables: "
        f"{len(journal_rows)} journals, "
        f"{len(publisher_rows)} publishers, "
        f"{len(hijacked_rows)} hijacked"
    )

1348

1349

def _populate_sources(
    session: Session,
    sources: dict,
    doaj_data: dict,
    pred: dict,
) -> None:
    """Build Source rows with cross-referenced DOAJ + predatory flags.

    Steps, in order:

    1. One record per OpenAlex source, deduplicated on
       ``(name_lower, issn or "")`` keeping the higher h-index.
    2. One record per DOAJ entry whose normalized name OpenAlex did not
       already provide.
    3. Quartile (Q1–Q4) from the ``cited_by_count`` rank within each
       ``source_type``.
    4. ``quality`` derived once per record, with the quartile available.
    5. Bulk insert in ``_BATCH_SIZE`` chunks.

    Args:
        session: Writable session bound to the build engine.
        sources: OpenAlex snapshot, ``{source_id: compact_record}``.
        doaj_data: DOAJ snapshot keyed by ISSN.
        pred: Name sets as returned by `_load_predatory`.
    """
    type_map = {"j": "journal", "c": "conference"}
    pred_journals = pred.get("journals", set())
    pred_publishers = pred.get("publishers", set())
    pred_hijacked = pred.get("hijacked", set())

    # Dedup key is (name_lower, issn or "") so journals with separate
    # print and electronic ISSNs in OpenAlex both survive instead of
    # collapsing onto one row.
    seen: dict[tuple[str, str], dict] = {}

    for source_id, compact in sources.items():
        name = (compact.get("n") or "").strip()
        if not name:
            continue

        name_lower = normalize_name(name)
        issn = normalize_issn(compact.get("i"))
        publisher = compact.get("p") or None
        h_index = compact.get("h")
        impact_factor = compact.get("if")
        cited_by_count = compact.get("cb")
        raw_type = compact.get("t", "")
        # Unknown type codes are stored as-is.
        source_type = type_map.get(raw_type, raw_type)

        doaj_entry = doaj_data.get(issn) if issn else None
        is_in_doaj = doaj_entry is not None

        # Predatory checks in priority order: journal name, then
        # publisher, then hijacked-journal name.
        is_pred = name_lower in pred_journals
        pred_source = "stop-predatory-journals" if is_pred else None
        if not is_pred and publisher:
            pub_norm = normalize_name(publisher)
            if pub_norm in pred_publishers:
                is_pred = True
                pred_source = "stop-predatory-publishers"
        if not is_pred and name_lower in pred_hijacked:
            is_pred = True
            pred_source = "stop-predatory-hijacked"

        # Whitelist override
        if is_pred and (
            is_in_doaj or (h_index or 0) > PREDATORY_WHITELIST_HINDEX
        ):
            is_pred = False
            pred_source = None

        rec = {
            "name": name,
            "name_lower": name_lower,
            "issn": issn,
            "openalex_source_id": source_id,
            "source_type": source_type,
            "publisher": publisher,
            "h_index": h_index,
            "impact_factor": impact_factor,
            "cited_by_count": cited_by_count,
            "quartile": None,  # filled in by the quartile pass below
            "quality": None,  # filled in by the scoring pass below
            "is_in_doaj": is_in_doaj,
            "is_predatory": is_pred,
            "predatory_source": pred_source,
            "score_source": "openalex",
        }

        key = (name_lower, issn or "")
        prev = seen.get(key)
        if prev is None or (h_index or 0) > (prev.get("h_index") or 0):
            seen[key] = rec

    # Second pass: DOAJ-only journals (not in OpenAlex). Without this
    # we lose ~4-7K small open-access venues. Keyed by name_lower so
    # we don't double-insert anything OpenAlex already covered.
    known_names = {k[0] for k in seen}
    doaj_added = 0
    for issn, doaj_entry in doaj_data.items():
        name = (doaj_entry.get("name") or "").strip()
        if not name:
            continue
        name_lower = normalize_name(name)
        if name_lower in known_names:
            continue
        seen[(name_lower, issn or "")] = {
            "name": name,
            "name_lower": name_lower,
            "issn": issn,
            "openalex_source_id": None,
            "source_type": "journal",
            "publisher": doaj_entry.get("publisher") or None,
            "h_index": None,
            "impact_factor": None,
            "cited_by_count": None,
            "quartile": None,
            "quality": None,  # filled in by the scoring pass below
            "is_in_doaj": True,
            "is_predatory": False,
            "predatory_source": None,
            "score_source": "doaj",
        }
        known_names.add(name_lower)
        doaj_added += 1

    records = list(seen.values())

    # Derive quartile (Q1–Q4) from cited_by_count percentile within each
    # source_type. Field-specific quartiles would be more accurate but
    # require per-source topic data that 4–7×s the snapshot size, so we
    # use global per-type percentiles as a defensible approximation
    # given the license constraint that ruled out SJR.
    by_type: dict[str, list[dict]] = {}
    for r in records:
        if r.get("cited_by_count") is None:
            continue  # NULL quartile for entries without citation data
        by_type.setdefault(r.get("source_type") or "", []).append(r)
    for type_records in by_type.values():
        type_records.sort(key=lambda r: r["cited_by_count"])
        n = len(type_records)
        if n == 0:
            continue
        for rank, r in enumerate(type_records):
            pct = rank / n  # 0.0 = lowest, ~1.0 = highest
            if pct >= 0.75:
                r["quartile"] = "Q1"
            elif pct >= 0.50:
                r["quartile"] = "Q2"
            elif pct >= 0.25:
                r["quartile"] = "Q3"
            else:
                r["quartile"] = "Q4"

    # Score every record exactly once, now that quartile is available.
    # Scoring before the quartile pass would be wasted work (the value
    # would be overwritten here) and would miss e.g. a Q1 journal with no
    # h-index data. The runtime filter code in
    # journal_reputation_filter.py does pass quartile, so the stored
    # column should agree with the live score.
    for r in records:
        r["quality"] = derive_quality_score(
            h_index=r.get("h_index"),
            quartile=r.get("quartile"),
            is_in_doaj=r.get("is_in_doaj") or False,
            is_predatory=r.get("is_predatory") or False,
            source_type=r.get("source_type"),
        )

    logger.info(
        f"Inserting {len(records)} source records ({doaj_added} DOAJ-only)..."
    )
    for i in range(0, len(records), _BATCH_SIZE):
        session.bulk_insert_mappings(
            inspect(Source), records[i : i + _BATCH_SIZE]
        )

1520

1521

def _populate_institutions(session: Session, institutions: dict) -> None:
    """Expand compact institution records and bulk-insert them.

    *institutions* maps an institution id to a compact record using
    one-character keys; entries whose name is missing or blank are
    skipped. Rows are inserted in ``_BATCH_SIZE`` chunks.
    """

    def _expand(inst_id, compact: dict, name: str) -> dict:
        """Translate one compact record into Institution column names."""
        return {
            "openalex_id": inst_id,
            "name": name,
            "name_lower": normalize_name(name),
            "ror_id": compact.get("r"),
            "country": compact.get("c"),
            "type": compact.get("t"),
            "h_index": compact.get("h"),
            "impact_factor": compact.get("if"),
            "works_count": compact.get("w"),
            "cited_by_count": compact.get("cb"),
        }

    rows: list[dict] = []
    for inst_id, compact in institutions.items():
        display_name = (compact.get("n") or "").strip()
        if display_name:
            rows.append(_expand(inst_id, compact, display_name))

    logger.info(f"Inserting {len(rows)} institution records...")
    mapper = None
    for start in range(0, len(rows), _BATCH_SIZE):
        batch = rows[start : start + _BATCH_SIZE]
        session.bulk_insert_mappings(inspect(Institution), batch)

1547

1548

def _populate_abbreviations(session: Session, mappings: dict) -> None:
    """Bulk-insert abbreviation → full-name rows.

    Abbreviations are normalized first; entries that normalize to an
    empty string are dropped, and when several abbreviations normalize
    to the same key only the first one encountered is kept. Rows are
    inserted in ``_BATCH_SIZE`` chunks.
    """
    # Insertion-ordered dict doubles as the "already seen" set: the first
    # full name recorded for a normalized key wins.
    first_by_key: dict[str, object] = {}
    for abbrev, full in mappings.items():
        key = normalize_name(abbrev)
        if key and key not in first_by_key:
            first_by_key[key] = full

    rows = [
        {"abbrev_lower": key, "full_name": full}
        for key, full in first_by_key.items()
    ]

    logger.info(f"Inserting {len(rows)} abbreviation records...")
    for start in range(0, len(rows), _BATCH_SIZE):
        batch = rows[start : start + _BATCH_SIZE]
        session.bulk_insert_mappings(inspect(Abbreviation), batch)

1563

1564

1565# ---------------------------------------------------------------------------

1566# Backwards-compat shim for the old build_reference_db name

1567# ---------------------------------------------------------------------------

1568

1569

def build_reference_db(
    data_dir: Optional[Path] = None,
    output_path: Optional[Path] = None,
) -> None:
    """Deprecated alias for `build_db`; forwards both arguments unchanged."""
    forwarded = {"data_dir": data_dir, "output_path": output_path}
    build_db(**forwarded)

Coverage for src/local_deep_research/journal_quality/db.py: 46%

666 statements