Coverage for src/local_deep_research/journal_quality/data_sources/predatory.py: 89%
57 statements
« prev ^ index » next coverage.py v7.14.1, created at 2026-06-03 23:15 +0000
« prev ^ index » next coverage.py v7.14.1, created at 2026-06-03 23:15 +0000
1"""Stop Predatory Journals data source.
3Community successor to Jeffrey Beall's original predatory publishers
4list (Beall took down his original blog post in 2017). The successor
5project maintains three CSV files (publishers, journals, hijacked) on
6GitHub which we merge into a single predatory.json.
7"""
9from __future__ import annotations
11import csv
12import io
13import json
14from pathlib import Path
16from loguru import logger
18from .base import DataSource
20_PREDATORY_BASE = (
21 "https://raw.githubusercontent.com/stop-predatory-journals/"
22 "stop-predatory-journals.github.io/master/_data"
23)
24_PREDATORY_FILES = {
25 "publishers": "publishers.csv",
26 "journals": "journals.csv",
27 "hijacked": "hijacked.csv",
28}
30# Safety floor — the upstream lists carry thousands of entries each.
31# Refuse to overwrite a healthy on-disk snapshot with a near-empty
32# payload (e.g. CDN partial outage where two of the three CSVs
33# returned 0 rows) since that would silently disable predatory
34# filtering for everyone.
35_MIN_PREDATORY_TOTAL = 100
class PredatorySource(DataSource):
    """Data source for the Stop Predatory Journals lists.

    Downloads the upstream ``publishers``, ``journals`` and ``hijacked``
    CSV files and merges them into a single JSON snapshot named by
    ``filename``.
    """

    # Descriptive attributes; presumably consumed by the DataSource base
    # class / registry (not visible from this module).
    key = "predatory"  # gitleaks:allow
    name = "Stop Predatory Journals"
    url = (
        "https://github.com/stop-predatory-journals/"
        "stop-predatory-journals.github.io"
    )
    dataset_url = (
        "https://github.com/stop-predatory-journals/"
        "stop-predatory-journals.github.io/tree/master/_data"
    )
    license = "MIT"
    license_url = "https://opensource.org/license/mit"
    description = (
        "Community successor to Beall's List — predatory publishers, "
        "journals, and hijacked journal entries"
    )
    filename = "predatory.json"
    count_label = "predatory entries"
    auto_download = True  # ~0.3 MB; fetch on first filter use
    required = False
    approx_size_mb = 0.3

    def fetch(self, data_dir: Path, progress_cb=None) -> int:
        """Download the three upstream CSVs and write the merged snapshot.

        Args:
            data_dir: Directory in which ``self.filename`` is written.
            progress_cb: Unused by this source.

        Returns:
            Combined number of publisher, journal and hijacked entries
            written to disk.

        Raises:
            RuntimeError: If fewer than ``_MIN_PREDATORY_TOTAL`` entries
                were parsed in total; nothing is written in that case.
            Exception: Whatever ``safe_get`` / ``raise_for_status`` raise
                on a failed download, and any ``OSError`` from writing
                the snapshot.
        """
        from ...security.safe_requests import (
            safe_get_with_retries as safe_get,
        )

        def _read_csv(filename: str) -> list[dict]:
            """Fetch one upstream CSV and return its rows as dicts."""
            url = f"{_PREDATORY_BASE}/{filename}"
            resp = safe_get(url, timeout=30, consume_body=True)
            resp.raise_for_status()
            # A leading UTF-8 BOM would otherwise become part of the first
            # header name ("\ufeffname") and every row would be dropped.
            text = resp.text.lstrip("\ufeff")
            reader = csv.DictReader(io.StringIO(text))
            # DictReader stores surplus fields under the key ``None`` and
            # fills missing fields with ``None``: drop the former and
            # normalise the latter to an empty string.
            return [
                {k: (v or "").strip() for k, v in row.items() if k}
                for row in reader
            ]

        def _named_entries(filename: str) -> list[dict]:
            """Return ``{"name", "url"}`` records for rows with a name."""
            return [
                {"name": row["name"], "url": row.get("url", "")}
                for row in _read_csv(filename)
                if row.get("name")
            ]

        publishers = _named_entries(_PREDATORY_FILES["publishers"])
        journals = _named_entries(_PREDATORY_FILES["journals"])

        # Upstream column names: hijacked, hijackedabbr, hijackedurl,
        # althijackedurl, authentic, authenticabbr, authenticurl.
        # The rest of the codebase reads `hijacked_name`, so map across.
        hijacked = [
            {
                "hijacked_name": row["hijacked"],
                "original_name": row.get("authentic", ""),
                "hijacked_url": row.get("hijackedurl", ""),
                "original_url": row.get("authenticurl", ""),
            }
            for row in _read_csv(_PREDATORY_FILES["hijacked"])
            if row.get("hijacked")
        ]

        # Check the safety floor before touching the filesystem.
        total = len(publishers) + len(journals) + len(hijacked)
        if total < _MIN_PREDATORY_TOTAL:
            raise RuntimeError(
                f"Predatory: suspiciously few records "
                f"({total} < {_MIN_PREDATORY_TOTAL}); refusing to "
                "overwrite existing data"
            )

        payload = {
            "metadata": {
                "source": (
                    "Stop Predatory Journals "
                    "(https://github.com/stop-predatory-journals/"
                    "stop-predatory-journals.github.io) — "
                    "community successor to Beall's List"
                ),
                "license": "MIT",
                "publisher_count": len(publishers),
                "journal_count": len(journals),
                "hijacked_count": len(hijacked),
            },
            "publishers": publishers,
            "journals": journals,
            "hijacked": hijacked,
        }

        # Write to a sibling temp file and swap it into place so readers
        # never observe a half-written snapshot.
        output = data_dir / self.filename
        tmp = data_dir / f"{self.filename}.tmp"
        try:
            with open(tmp, "w", encoding="utf-8") as f:
                json.dump(payload, f)
            # Path.replace overwrites an existing destination on every
            # platform; Path.rename raises FileExistsError on Windows
            # when the snapshot already exists.
            tmp.replace(output)
        except Exception:
            # Do not leave a stale or partial temp file behind.
            tmp.unlink(missing_ok=True)
            raise

        logger.info(
            f"Predatory: saved {len(publishers)} publishers + "
            f"{len(journals)} journals + {len(hijacked)} hijacked"
        )
        return total