Coverage for src/local_deep_research/journal_quality/data_sources/predatory.py

1"""Stop Predatory Journals data source.

3Community successor to Jeffrey Beall's original predatory publishers

4list (Beall took down his original blog post in 2017). The successor

5project maintains three CSV files (publishers, journals, hijacked) on

6GitHub which we merge into a single predatory.json.

7"""

9from __future__ import annotations

11import csv

12import io

13import json

14from pathlib import Path

16from loguru import logger

18from .base import DataSource

# Root URL of the upstream repository's ``_data`` directory (raw content).
# ``PredatorySource.fetch`` appends "/<csv filename>" to this to build each
# download URL.
_PREDATORY_BASE = (
    "https://raw.githubusercontent.com/stop-predatory-journals/"
    + "stop-predatory-journals.github.io/master/_data"
)

# Logical list name -> CSV filename under ``_PREDATORY_BASE``.
_PREDATORY_FILES = dict(
    publishers="publishers.csv",
    journals="journals.csv",
    hijacked="hijacked.csv",
)

# Safety floor: the upstream lists carry thousands of entries each, so a
# combined count below this value means the download was degraded (e.g. a
# CDN partial outage where two of the three CSVs returned 0 rows).  In that
# case we refuse to overwrite a healthy on-disk snapshot, because doing so
# would silently disable predatory filtering for everyone.
_MIN_PREDATORY_TOTAL = 100

class PredatorySource(DataSource):
    """Data source for the Stop Predatory Journals lists.

    ``fetch`` downloads the upstream publishers, journals and hijacked
    CSV files and merges them into one JSON snapshot (``filename``)
    inside the directory it is given.
    """

    key = "predatory"  # gitleaks:allow
    name = "Stop Predatory Journals"
    url = (
        "https://github.com/stop-predatory-journals/"
        "stop-predatory-journals.github.io"
    )
    dataset_url = (
        "https://github.com/stop-predatory-journals/"
        "stop-predatory-journals.github.io/tree/master/_data"
    )
    license = "MIT"
    license_url = "https://opensource.org/license/mit"
    description = (
        "Community successor to Beall's List — predatory publishers, "
        "journals, and hijacked journal entries"
    )
    filename = "predatory.json"
    count_label = "predatory entries"
    auto_download = True  # ~0.3 MB; fetch on first filter use
    required = False
    approx_size_mb = 0.3

    def fetch(self, data_dir: Path, progress_cb=None) -> int:
        """Download the three upstream CSVs and write the merged snapshot.

        The CSVs are requested in the order publishers, journals,
        hijacked.  Rows without a name are dropped.  The merged payload
        is written to a temporary file next to the target and then
        swapped into place, so a failed write never leaves a truncated
        snapshot under ``filename``.

        Args:
            data_dir: Directory that receives ``filename``.  It is not
                created here.
            progress_cb: Not used by this implementation.

        Returns:
            Combined number of publisher, journal and hijacked entries
            written.

        Raises:
            RuntimeError: If the combined entry count is below
                ``_MIN_PREDATORY_TOTAL``; nothing is written in that case.

        Errors from the HTTP helper, ``raise_for_status()`` or the file
        system propagate unchanged.
        """
        from ...security.safe_requests import (
            safe_get_with_retries as safe_get,
        )

        def _read_csv(filename: str) -> list[dict]:
            """Fetch one upstream CSV and return its rows as stripped dicts."""
            url = f"{_PREDATORY_BASE}/{filename}"
            resp = safe_get(url, timeout=30, consume_body=True)
            resp.raise_for_status()
            reader = csv.DictReader(io.StringIO(resp.text))
            # ``if k`` drops DictReader's ``None`` key (surplus cells on
            # over-long rows); ``v or ""`` covers the ``None`` fill value
            # DictReader uses for short rows.
            return [
                {k: (v or "").strip() for k, v in row.items() if k}
                for row in reader
            ]

        publishers: list[dict] = [
            {"name": row["name"], "url": row.get("url", "")}
            for row in _read_csv(_PREDATORY_FILES["publishers"])
            if row.get("name")
        ]

        journals: list[dict] = [
            {"name": row["name"], "url": row.get("url", "")}
            for row in _read_csv(_PREDATORY_FILES["journals"])
            if row.get("name")
        ]

        # Upstream column names: hijacked, hijackedabbr, hijackedurl,
        # althijackedurl, authentic, authenticabbr, authenticurl.
        # The rest of the codebase reads `hijacked_name`, so map across.
        hijacked: list[dict] = [
            {
                "hijacked_name": row["hijacked"],
                "original_name": row.get("authentic", ""),
                "hijacked_url": row.get("hijackedurl", ""),
                "original_url": row.get("authenticurl", ""),
            }
            for row in _read_csv(_PREDATORY_FILES["hijacked"])
            if row.get("hijacked")
        ]

        # Check the safety floor before touching the disk so a degraded
        # download can never replace a healthy snapshot.
        total = len(publishers) + len(journals) + len(hijacked)
        if total < _MIN_PREDATORY_TOTAL:
            raise RuntimeError(
                f"Predatory: suspiciously few records "
                f"({total} < {_MIN_PREDATORY_TOTAL}); refusing to "
                "overwrite existing data"
            )

        payload = {
            "metadata": {
                "source": (
                    "Stop Predatory Journals "
                    "(https://github.com/stop-predatory-journals/"
                    "stop-predatory-journals.github.io) — "
                    "community successor to Beall's List"
                ),
                "license": "MIT",
                "publisher_count": len(publishers),
                "journal_count": len(journals),
                "hijacked_count": len(hijacked),
            },
            "publishers": publishers,
            "journals": journals,
            "hijacked": hijacked,
        }

        output = data_dir / self.filename
        tmp = data_dir / f"{self.filename}.tmp"
        try:
            with open(tmp, "w", encoding="utf-8") as f:
                json.dump(payload, f)
            # Path.replace overwrites an existing target on every platform.
            # Path.rename raises FileExistsError on Windows when the
            # snapshot already exists, which would break every refresh.
            tmp.replace(output)
        except BaseException:
            # Best-effort removal of a half-written temp file; the original
            # error is what the caller needs to see, so cleanup failures
            # are deliberately ignored.
            try:
                tmp.unlink()
            except OSError:
                pass
            raise

        logger.info(
            f"Predatory: saved {len(publishers)} publishers + "
            f"{len(journals)} journals + {len(hijacked)} hijacked"
        )
        return total

Coverage for src/local_deep_research/journal_quality/data_sources/predatory.py: 89%

57 statements