Coverage for src/local_deep_research/journal_quality/data_sources/doaj.py: 49%
55 statements
« prev ^ index » next coverage.py v7.14.1, created at 2026-06-03 23:15 +0000
« prev ^ index » next coverage.py v7.14.1, created at 2026-06-03 23:15 +0000
1"""DOAJ (Directory of Open Access Journals) data source.
3Downloads the **public CSV dump** of all DOAJ journals from
4``https://doaj.org/csv`` — a single HTTP GET, no auth, no rate
5limits, ~25 MB, ~22K journals. This replaces the previous paginated
6``/api/search/journals`` implementation, which required hundreds of
7requests with polite sleeps between them.
8"""
10from __future__ import annotations
12import csv
13import io
14import json
15import time
16from pathlib import Path
18from loguru import logger
20from ...utilities.citation_normalizer import normalize_issn
21from .base import DataSource
# Public CSV of the full DOAJ journal list. CC0 metadata.
# DOAJSource.fetch() downloads this URL with a single GET.
_DOAJ_CSV_URL = "https://doaj.org/csv"

# Column headers in the DOAJ public CSV (as of the current schema).
# DOAJ has historically been stable about these, but rows are read via
# csv.DictReader and looked up by header name, so a column reorder
# doesn't break us. A column *rename* would: the affected lookup then
# yields an empty value for every row.
_COL_TITLE = "Journal title"
_COL_PISSN = "Journal ISSN (print version)"
_COL_EISSN = "Journal EISSN (online version)"
_COL_SEAL = "DOAJ Seal"
_COL_PUBLISHER = "Publisher"

# Safety floor — DOAJ has ~22K journals. A fetch that returns far fewer
# records almost certainly indicates a schema change upstream (e.g.
# column rename breaking ISSN lookups) and should NOT overwrite the
# existing good data file. fetch() raises RuntimeError below this count,
# before anything is written to disk.
_MIN_DOAJ_JOURNALS = 5_000
class DOAJSource(DataSource):
    """Data source backed by the DOAJ public CSV journal dump.

    ``fetch`` downloads the CSV at ``_DOAJ_CSV_URL``, reduces every row
    to ``{"name", "has_seal", "publisher"}`` keyed by normalized ISSN,
    and stores the result as ``{"journals": {...}}`` JSON in
    ``data_dir / filename``.
    """

    key = "doaj"  # gitleaks:allow
    name = "Directory of Open Access Journals"
    url = "https://doaj.org"
    dataset_url = "https://doaj.org/docs/public-data-dump"
    license = "CC0 (metadata)"
    license_url = "https://creativecommons.org/publicdomain/zero/1.0/"
    description = "~22K verified open access journals with DOAJ Seal status"
    filename = "doaj_journals.json"
    count_label = "DOAJ journals"
    auto_download = False
    required = False  # best-effort
    approx_size_mb = 5.0

    def fetch(self, data_dir: Path, progress_cb=None) -> int:
        """Download the DOAJ CSV and write the journal index as JSON.

        Args:
            data_dir: Directory that receives ``self.filename``. It is
                not created here, so it must already exist.
            progress_cb: Not invoked by this implementation; presumably
                kept so the signature matches ``DataSource.fetch``
                (verify against the base class).

        Returns:
            Number of journals written.

        Raises:
            RuntimeError: Fewer than ``_MIN_DOAJ_JOURNALS`` journals were
                parsed. Nothing is written in that case, so an existing
                data file stays intact.
            Exception: Whatever ``safe_get`` / ``raise_for_status`` raise
                on a failed download, and ``OSError`` from writing.
        """
        from ...security.safe_requests import (
            safe_get_with_retries as safe_get,
        )

        logger.info(f"Fetching DOAJ public CSV dump: {_DOAJ_CSV_URL}")
        start = time.time()
        # consume_body: the CSV is ~25 MB, so a mid-stream
        # ChunkedEncodingError / ReadTimeout is a realistic failure
        # mode worth retrying. Without this flag the body-read fires
        # outside safe_get_with_retries' retry loop.
        resp = safe_get(_DOAJ_CSV_URL, timeout=120, consume_body=True)
        resp.raise_for_status()

        # DOAJ serves UTF-8 CSV. "utf-8-sig" is plain UTF-8 that also
        # drops a leading BOM if one is present; a BOM left in place
        # would be glued onto the first header name and silently break
        # that column's by-name lookup.
        text = resp.content.decode("utf-8-sig", errors="replace")
        journals = self._parse_doaj_csv(text)

        if len(journals) < _MIN_DOAJ_JOURNALS:
            raise RuntimeError(
                f"DOAJ: suspiciously few journals "
                f"({len(journals):,} < {_MIN_DOAJ_JOURNALS:,}); "
                "refusing to overwrite existing data. "
                "Possible CSV schema change upstream."
            )

        self._write_journals_file(data_dir, journals)

        elapsed = time.time() - start
        logger.info(f"DOAJ: saved {len(journals):,} journals in {elapsed:.0f}s")
        return len(journals)

    @staticmethod
    def _parse_doaj_csv(text: str) -> dict:
        """Turn the decoded CSV text into ``{issn: journal_record}``."""
        # Parse in-memory — the whole file is ~25 MB and we need access
        # to columns by header name.
        reader = csv.DictReader(io.StringIO(text))

        journals: dict = {}
        for row in reader:
            # Prefer print ISSN, fall back to electronic. The CSV uses
            # empty strings for missing values (and DictReader yields
            # None for cells missing from a short row). Normalize to the
            # 8-char no-dash canonical form so lookups (which also
            # normalize) match regardless of the upstream format.
            # The fallback also applies when the print ISSN is present
            # but does not normalize, so one malformed cell does not
            # drop a journal that has a usable electronic ISSN.
            issn = None
            for column in (_COL_PISSN, _COL_EISSN):
                raw_issn = (row.get(column) or "").strip()
                if not raw_issn:
                    continue
                # normalize_issn is expected to return a falsy value for
                # input it cannot normalize.
                issn = normalize_issn(raw_issn)
                if issn:
                    break
            if not issn:
                continue

            # DOAJ's seal column is actually ternary in the CSV: "yes",
            # "no", or blank (never applied). The current consumer
            # (scoring.py) only needs the boolean floor, so we collapse
            # blank and "no" into False here. If a future scoring tier
            # wants to distinguish "applied and was denied" from "never
            # applied", preserve the raw value in a new dict key and
            # plumb it through the Source ORM.
            seal_raw = (row.get(_COL_SEAL) or "").strip().lower()
            journals[issn] = {
                "name": (row.get(_COL_TITLE) or "").strip(),
                "has_seal": seal_raw in ("yes", "true", "1"),
                "publisher": (row.get(_COL_PUBLISHER) or "").strip(),
            }
        return journals

    def _write_journals_file(self, data_dir: Path, journals: dict) -> None:
        """Write ``journals`` to ``data_dir / self.filename`` via a temp file."""
        output = data_dir / self.filename
        tmp = data_dir / f"{self.filename}.tmp"
        try:
            with open(tmp, "w", encoding="utf-8") as f:
                json.dump({"journals": journals}, f)
            # Path.replace overwrites an existing target on every
            # platform. Path.rename raises FileExistsError on Windows
            # when the data file already exists, i.e. on every refresh.
            tmp.replace(output)
        except Exception:
            # Best-effort cleanup so a failed write does not leave a
            # stale .tmp file behind; the original error is what matters.
            try:
                tmp.unlink()
            except OSError:
                pass
            raise