Coverage for src/local_deep_research/journal_quality/data_sources/predatory.py: 89%

57 statements  

« prev     ^ index     » next       coverage.py v7.14.1, created at 2026-06-03 23:15 +0000

1"""Stop Predatory Journals data source. 

2 

3Community successor to Jeffrey Beall's original predatory publishers 

4list (Beall took down his original blog post in 2017). The successor 

5project maintains three CSV files (publishers, journals, hijacked) on 

6GitHub which we merge into a single predatory.json. 

7""" 

8 

9from __future__ import annotations 

10 

11import csv 

12import io 

13import json 

14from pathlib import Path 

15 

16from loguru import logger 

17 

18from .base import DataSource 

19 

20_PREDATORY_BASE = ( 

21 "https://raw.githubusercontent.com/stop-predatory-journals/" 

22 "stop-predatory-journals.github.io/master/_data" 

23) 

24_PREDATORY_FILES = { 

25 "publishers": "publishers.csv", 

26 "journals": "journals.csv", 

27 "hijacked": "hijacked.csv", 

28} 

29 

30# Safety floor — the upstream lists carry thousands of entries each. 

31# Refuse to overwrite a healthy on-disk snapshot with a near-empty 

32# payload (e.g. CDN partial outage where two of the three CSVs 

33# returned 0 rows) since that would silently disable predatory 

34# filtering for everyone. 

35_MIN_PREDATORY_TOTAL = 100 

36 

37 

class PredatorySource(DataSource):
    """Data source backed by the Stop Predatory Journals lists.

    ``fetch`` downloads the upstream publishers, journals and hijacked
    CSV files and merges them into a single ``predatory.json`` snapshot
    inside the supplied data directory.
    """

    key = "predatory"  # gitleaks:allow
    name = "Stop Predatory Journals"
    url = (
        "https://github.com/stop-predatory-journals/"
        "stop-predatory-journals.github.io"
    )
    dataset_url = (
        "https://github.com/stop-predatory-journals/"
        "stop-predatory-journals.github.io/tree/master/_data"
    )
    license = "MIT"
    license_url = "https://opensource.org/license/mit"
    description = (
        "Community successor to Beall's List — predatory publishers, "
        "journals, and hijacked journal entries"
    )
    filename = "predatory.json"
    count_label = "predatory entries"
    auto_download = True  # ~0.3 MB; fetch on first filter use
    required = False
    approx_size_mb = 0.3

    def fetch(self, data_dir: Path, progress_cb=None) -> int:
        """Download the three upstream CSVs and write ``predatory.json``.

        Args:
            data_dir: Directory that receives ``self.filename``. A
                sibling ``<filename>.tmp`` is written first and then
                swapped into place.
            progress_cb: Accepted for signature compatibility; this
                implementation never calls it.

        Returns:
            Combined number of publisher, journal and hijacked entries
            written.

        Raises:
            RuntimeError: If fewer than ``_MIN_PREDATORY_TOTAL`` entries
                were collected; the existing file is left untouched.

        Errors raised by the HTTP helper or by
        ``resp.raise_for_status()`` propagate unchanged, as do I/O
        errors from writing the snapshot.
        """
        # Imported lazily, as in the original implementation.
        from ...security.safe_requests import (
            safe_get_with_retries as safe_get,
        )

        def _read_csv(filename: str) -> list[dict]:
            """Fetch one upstream CSV and return its rows as dicts.

            Values are stripped, missing cells become ``""``, and
            entries whose key is empty/``None`` (cells beyond the header
            width) are dropped.
            """
            url = f"{_PREDATORY_BASE}/{filename}"
            resp = safe_get(url, timeout=30, consume_body=True)
            resp.raise_for_status()
            reader = csv.DictReader(io.StringIO(resp.text))
            return [
                {k: (v or "").strip() for k, v in row.items() if k}
                for row in reader
            ]

        def _named_entries(filename: str) -> list[dict]:
            """Return ``{"name", "url"}`` dicts for rows with a name."""
            return [
                {"name": row["name"], "url": row.get("url", "")}
                for row in _read_csv(filename)
                if row.get("name")
            ]

        # Download order is kept: publishers, journals, hijacked.
        publishers = _named_entries(_PREDATORY_FILES["publishers"])
        journals = _named_entries(_PREDATORY_FILES["journals"])

        # Upstream column names: hijacked, hijackedabbr, hijackedurl,
        # althijackedurl, authentic, authenticabbr, authenticurl.
        # The rest of the codebase reads `hijacked_name`, so map across.
        hijacked = [
            {
                "hijacked_name": row["hijacked"],
                "original_name": row.get("authentic", ""),
                "hijacked_url": row.get("hijackedurl", ""),
                "original_url": row.get("authenticurl", ""),
            }
            for row in _read_csv(_PREDATORY_FILES["hijacked"])
            if row.get("hijacked")
        ]

        # Safety floor: bail out before touching the disk when the
        # download looks truncated.
        total = len(publishers) + len(journals) + len(hijacked)
        if total < _MIN_PREDATORY_TOTAL:
            raise RuntimeError(
                f"Predatory: suspiciously few records "
                f"({total} < {_MIN_PREDATORY_TOTAL}); refusing to "
                "overwrite existing data"
            )

        payload = {
            "metadata": {
                "source": (
                    "Stop Predatory Journals "
                    "(https://github.com/stop-predatory-journals/"
                    "stop-predatory-journals.github.io) — "
                    "community successor to Beall's List"
                ),
                "license": "MIT",
                "publisher_count": len(publishers),
                "journal_count": len(journals),
                "hijacked_count": len(hijacked),
            },
            "publishers": publishers,
            "journals": journals,
            "hijacked": hijacked,
        }

        output = data_dir / self.filename
        tmp = data_dir / f"{self.filename}.tmp"
        try:
            with open(tmp, "w", encoding="utf-8") as f:
                json.dump(payload, f)
            # Path.replace overwrites an existing target on every
            # platform; Path.rename raises FileExistsError on Windows
            # when the snapshot already exists, which broke every
            # refresh after the first one there.
            tmp.replace(output)
        except Exception:
            # Do not leave a partial temp file behind on failure.
            tmp.unlink(missing_ok=True)
            raise

        logger.info(
            f"Predatory: saved {len(publishers)} publishers + "
            f"{len(journals)} journals + {len(hijacked)} hijacked"
        )
        return total