Coverage for src/local_deep_research/journal_quality/data_sources/doaj.py

"""DOAJ (Directory of Open Access Journals) data source.

Downloads the **public CSV dump** of all DOAJ journals from
``https://doaj.org/csv`` — a single HTTP GET, no auth, no rate
limits, ~25 MB, ~22K journals. This replaces the previous paginated
``/api/search/journals`` implementation, which required hundreds of
requests with polite sleeps between them.
"""

10from __future__ import annotations

12import csv

13import io

14import json

15import time

16from pathlib import Path

18from loguru import logger

20from ...utilities.citation_normalizer import normalize_issn

21from .base import DataSource

# Public CSV of the full DOAJ journal list. CC0 metadata.
_DOAJ_CSV_URL = "https://doaj.org/csv"

# Column headers in the DOAJ public CSV (as of the current schema).
# DOAJ has historically been stable about these, but we look them up
# by header name so a column reorder doesn't break us.
_COL_TITLE = "Journal title"
_COL_PISSN = "Journal ISSN (print version)"
_COL_EISSN = "Journal EISSN (online version)"
_COL_PUBLISHER = "Publisher"
# NB: the "DOAJ Seal" column is intentionally no longer parsed — DOAJ
# retired the Seal in April 2025 and removed it from their metadata, so
# the column only ever yields blanks now:
# https://blog.doaj.org/2025/04/09/our-metadata-changes-are-live-and-the-seal-has-been-retired/

# Safety floor — DOAJ has ~22K journals. A fetch that returns far fewer
# records almost certainly indicates a schema change upstream (e.g. a
# column rename breaking ISSN lookups) and should NOT overwrite the
# existing good data file.
_MIN_DOAJ_JOURNALS = 5_000

class DOAJSource(DataSource):
    """Data source backed by the DOAJ public CSV dump.

    ``fetch`` downloads the CSV at ``_DOAJ_CSV_URL``, builds an
    ISSN-keyed mapping of journal name and publisher, and writes it as
    JSON to ``<data_dir>/<filename>``.
    """

    # Descriptive metadata for this source.
    # NOTE(review): how the ``DataSource`` base class consumes these
    # attributes is not visible from this file — confirm there before
    # renaming or removing any of them.
    key = "doaj"  # gitleaks:allow
    name = "Directory of Open Access Journals"
    url = "https://doaj.org"
    dataset_url = "https://doaj.org/docs/public-data-dump"
    license = "CC0 (metadata)"
    license_url = "https://creativecommons.org/publicdomain/zero/1.0/"
    description = "~22K verified open access journals"
    filename = "doaj_journals.json"
    count_label = "DOAJ journals"
    auto_download = False
    required = False  # best-effort
    approx_size_mb = 5.0

    def fetch(self, data_dir: Path, progress_cb=None) -> int:
        """Download the DOAJ CSV and write the ISSN-keyed JSON file.

        Args:
            data_dir: Directory that the output file (``self.filename``)
                is written into.
            progress_cb: Unused by this implementation.

        Returns:
            The number of distinct ISSN keys written to the output file.

        Raises:
            RuntimeError: If fewer than ``_MIN_DOAJ_JOURNALS`` journals
                were parsed. The existing output file is left untouched
                in that case.

        Errors from the HTTP request, ``raise_for_status()`` and the
        file write also propagate to the caller.
        """
        from ...security.safe_requests import (
            safe_get_with_retries as safe_get,
        )

        logger.info(f"Fetching DOAJ public CSV dump: {_DOAJ_CSV_URL}")
        start = time.time()
        # consume_body: the CSV is ~25 MB, so a mid-stream
        # ChunkedEncodingError / ReadTimeout is a realistic failure
        # mode worth retrying. Without this flag the body-read fires
        # outside safe_get_with_retries' retry loop.
        resp = safe_get(_DOAJ_CSV_URL, timeout=120, consume_body=True)
        resp.raise_for_status()

        # DOAJ serves UTF-8 CSV. Parse in-memory — the whole file is
        # ~25 MB and we need random column access. "utf-8-sig" decodes
        # exactly like "utf-8" but drops a leading byte-order mark if
        # one is present; otherwise the BOM would stay glued to the
        # first header and lookups of that column would silently miss.
        text = resp.content.decode("utf-8-sig", errors="replace")
        reader = csv.DictReader(io.StringIO(text))

        # Keyed by normalized ISSN; a later row with the same ISSN
        # overwrites the earlier one.
        journals: dict[str, dict[str, str]] = {}
        for row in reader:
            # Prefer print ISSN, fall back to electronic. The CSV uses
            # empty strings for missing values. Normalize to the
            # 8-char no-dash canonical form so lookups (which also
            # normalize) match regardless of the upstream format.
            raw_issn = (row.get(_COL_PISSN) or "").strip() or (
                row.get(_COL_EISSN) or ""
            ).strip()
            issn = normalize_issn(raw_issn)
            if not issn:
                continue

            journals[issn] = {
                "name": (row.get(_COL_TITLE) or "").strip(),
                "publisher": (row.get(_COL_PUBLISHER) or "").strip(),
            }

        if len(journals) < _MIN_DOAJ_JOURNALS:
            raise RuntimeError(
                f"DOAJ: suspiciously few journals "
                f"({len(journals):,} < {_MIN_DOAJ_JOURNALS:,}); "
                "refusing to overwrite existing data. "
                "Possible CSV schema change upstream."
            )

        # Write to a sibling temp file first, then swap it into place,
        # so readers never observe a half-written JSON file.
        output = data_dir / self.filename
        tmp = data_dir / f"{self.filename}.tmp"
        try:
            with open(tmp, "w", encoding="utf-8") as f:
                json.dump({"journals": journals}, f)
            # Path.replace overwrites an existing destination on every
            # platform; Path.rename raises FileExistsError on Windows
            # when the destination already exists, which would make
            # every refresh after the first one fail there.
            tmp.replace(output)
        except Exception:
            # Best-effort cleanup so a failed write or swap does not
            # leave a stale temp file behind. The original error is the
            # one worth reporting, so a cleanup failure is ignored.
            try:
                tmp.unlink()
            except OSError:
                pass
            raise

        elapsed = time.time() - start
        logger.info(f"DOAJ: saved {len(journals):,} journals in {elapsed:.0f}s")
        return len(journals)

Coverage for src/local_deep_research/journal_quality/data_sources/doaj.py: 49%

53 statements