Coverage for src/local_deep_research/journal_quality/data_sources/doaj.py: 49%

55 statements  

« prev     ^ index     » next       coverage.py v7.14.1, created at 2026-06-03 23:15 +0000

1"""DOAJ (Directory of Open Access Journals) data source. 

2 

3Downloads the **public CSV dump** of all DOAJ journals from 

4``https://doaj.org/csv`` — a single HTTP GET, no auth, no rate 

5limits, ~25 MB, ~22K journals. This replaces the previous paginated 

6``/api/search/journals`` implementation, which required hundreds of 

7requests with polite sleeps between them. 

8""" 

9 

10from __future__ import annotations 

11 

12import csv 

13import io 

14import json 

15import time 

16from pathlib import Path 

17 

18from loguru import logger 

19 

20from ...utilities.citation_normalizer import normalize_issn 

21from .base import DataSource 

22 

# Public CSV of the full DOAJ journal list. CC0 metadata.
# DOAJSource.fetch downloads this URL with a single GET.
_DOAJ_CSV_URL = "https://doaj.org/csv"

# Column headers in the DOAJ public CSV (as of the current schema).
# DOAJ has historically been stable about these but we look them up
# by header name (rows are parsed with csv.DictReader) so a column
# reorder doesn't break us. A *rename* upstream would make the lookup
# return nothing; the _MIN_DOAJ_JOURNALS floor below guards against
# that for the ISSN columns.
_COL_TITLE = "Journal title"
_COL_PISSN = "Journal ISSN (print version)"
_COL_EISSN = "Journal EISSN (online version)"
_COL_SEAL = "DOAJ Seal"
_COL_PUBLISHER = "Publisher"

# Safety floor — DOAJ has ~22K journals. A fetch that returns far fewer
# records almost certainly indicates a schema change upstream (e.g.
# column rename breaking ISSN lookups) and should NOT overwrite the
# existing good data file. fetch() raises RuntimeError below this count.
_MIN_DOAJ_JOURNALS = 5_000

40 

41 

class DOAJSource(DataSource):
    """Data source that builds a local journal index from the DOAJ CSV dump.

    ``fetch`` downloads the public CSV, keeps one record per normalized
    ISSN (title, seal flag, publisher) and writes the result to
    ``<data_dir>/doaj_journals.json`` as ``{"journals": {issn: {...}}}``.
    """

    key = "doaj"  # gitleaks:allow
    name = "Directory of Open Access Journals"
    url = "https://doaj.org"
    dataset_url = "https://doaj.org/docs/public-data-dump"
    license = "CC0 (metadata)"
    license_url = "https://creativecommons.org/publicdomain/zero/1.0/"
    description = "~22K verified open access journals with DOAJ Seal status"
    filename = "doaj_journals.json"
    count_label = "DOAJ journals"
    auto_download = False
    required = False  # best-effort
    approx_size_mb = 5.0

    def fetch(self, data_dir: Path, progress_cb=None) -> int:
        """Download the DOAJ CSV and write the journal index as JSON.

        Args:
            data_dir: Directory that receives ``self.filename``. It is
                not created here, so it must already exist.
            progress_cb: Unused by this source (the download is a single
                request); presumably kept to match the ``DataSource``
                fetch signature.

        Returns:
            Number of journals written (distinct normalized ISSNs).

        Raises:
            RuntimeError: If fewer than ``_MIN_DOAJ_JOURNALS`` records
                were parsed. The existing data file is left untouched.
            Exception: Whatever ``safe_get_with_retries`` or
                ``resp.raise_for_status()`` raise on network/HTTP
                failure, and ``OSError`` from writing the output file.
        """
        from ...security.safe_requests import (
            safe_get_with_retries as safe_get,
        )

        logger.info(f"Fetching DOAJ public CSV dump: {_DOAJ_CSV_URL}")
        start = time.time()
        # consume_body: the CSV is ~25 MB, so a mid-stream
        # ChunkedEncodingError / ReadTimeout is a realistic failure
        # mode worth retrying. Without this flag the body-read fires
        # outside safe_get_with_retries' retry loop.
        resp = safe_get(_DOAJ_CSV_URL, timeout=120, consume_body=True)
        resp.raise_for_status()

        # DOAJ serves UTF-8 CSV. Parse in-memory — the whole file is
        # ~25 MB and we need random column access.
        # "utf-8-sig" strips an optional leading BOM; with plain "utf-8"
        # a BOM would stay glued to the first header ("Journal title")
        # and every title lookup would silently come back empty.
        text = resp.content.decode("utf-8-sig", errors="replace")
        reader = csv.DictReader(io.StringIO(text))

        journals: dict = {}
        for row in reader:
            # Prefer print ISSN, fall back to electronic. The CSV uses
            # empty strings for missing values. Normalize to the
            # 8-char no-dash canonical form so lookups (which also
            # normalize) match regardless of the upstream format.
            raw_issn = (row.get(_COL_PISSN) or "").strip() or (
                row.get(_COL_EISSN) or ""
            ).strip()
            issn = normalize_issn(raw_issn)
            if not issn:
                continue

            # DOAJ's seal column is actually ternary in the CSV: "yes",
            # "no", or blank (never applied). The current consumer
            # (scoring.py) only needs the boolean floor, so we collapse
            # blank and "no" into False here. If a future scoring tier
            # wants to distinguish "applied and was denied" from "never
            # applied", preserve the raw value in a new dict key and
            # plumb it through the Source ORM.
            seal_raw = (row.get(_COL_SEAL) or "").strip().lower()
            journals[issn] = {
                "name": (row.get(_COL_TITLE) or "").strip(),
                "has_seal": seal_raw in ("yes", "true", "1"),
                "publisher": (row.get(_COL_PUBLISHER) or "").strip(),
            }

        if len(journals) < _MIN_DOAJ_JOURNALS:
            raise RuntimeError(
                f"DOAJ: suspiciously few journals "
                f"({len(journals):,} < {_MIN_DOAJ_JOURNALS:,}); "
                "refusing to overwrite existing data. "
                "Possible CSV schema change upstream."
            )

        # Write to a sibling temp file, then swap it into place so
        # readers never observe a half-written JSON file.
        output = data_dir / self.filename
        tmp = data_dir / f"{self.filename}.tmp"
        try:
            with open(tmp, "w", encoding="utf-8") as f:
                json.dump({"journals": journals}, f)
            # Path.replace overwrites an existing target on every
            # platform; Path.rename raises FileExistsError on Windows
            # when the previous data file is still there.
            tmp.replace(output)
        finally:
            # No-op after a successful replace; removes the partial
            # temp file if the write or the swap failed.
            tmp.unlink(missing_ok=True)

        elapsed = time.time() - start
        logger.info(f"DOAJ: saved {len(journals):,} journals in {elapsed:.0f}s")
        return len(journals)