Coverage for src/local_deep_research/journal_quality/scoring.py: 96%

60 statements  

« prev     ^ index     » next       coverage.py v7.14.1, created at 2026-06-03 23:15 +0000

1"""Pure-function quality scoring helpers. 

2 

3These are stateless and called by both the build phase 

4(`db.build_db()` populating the `quality` column) and the runtime 

5filter (when scoring an institution-only fallback). 

6 

7Kept in a small module of their own so the build phase doesn't have 

8to import the read-only DB accessor just to get at the score 

9thresholds. 

10""" 

11 

12from __future__ import annotations 

13 

14import unicodedata 

15from typing import Optional 

16 

17from ..constants import ( 

18 CONFERENCE_QUALITY_DEFAULT, 

19 DOAJ_QUALITY_NO_SEAL, 

20 DOAJ_QUALITY_WITH_SEAL, 

21 INSTITUTION_HINDEX_HIGH, 

22 INSTITUTION_HINDEX_TOP, 

23 INSTITUTION_QUALITY_DEFAULT, 

24 INSTITUTION_QUALITY_HIGH, 

25 INSTITUTION_QUALITY_TOP, 

26 JOURNAL_HINDEX_ACCEPTABLE, 

27 JOURNAL_HINDEX_ELITE, 

28 JOURNAL_HINDEX_GOOD, 

29 JOURNAL_HINDEX_STRONG, 

30 JOURNAL_HINDEX_VERY_GOOD, 

31 JOURNAL_QUALITY_ACCEPTABLE, 

32 JOURNAL_QUALITY_DEFAULT, 

33 JOURNAL_QUALITY_ELITE, 

34 JOURNAL_QUALITY_GOOD, 

35 JOURNAL_QUALITY_PREDATORY, 

36 JOURNAL_QUALITY_STRONG, 

37 REPOSITORY_QUALITY_DEFAULT, 

38 JOURNAL_QUALITY_VERY_GOOD, 

39) 

40 

41 

def normalize_name(name: str) -> str:
    """Return *name* in the canonical form used for name matching.

    The text is NFKC-normalized, then lowercased, then stripped of
    leading and trailing whitespace, in that order.

    This replaces the `_normalize` helper that used to be duplicated in
    `journal_reference_db.py` and `journal_data_manager.py`; keeping a
    single copy here means the build phase and the runtime accessor
    normalize names the same way.

    Args:
        name: Raw journal / venue / institution name.

    Returns:
        The normalized name.
    """
    # NFKC folds compatibility characters (full-width letters,
    # ligatures, ...) into their plain equivalents before case folding.
    compat_form = unicodedata.normalize("NFKC", name)
    lowered = compat_form.lower()
    return lowered.strip()

50 

51 

def derive_quality_score(
    *,
    h_index: Optional[int] = None,
    quartile: Optional[str] = None,
    is_in_doaj: bool = False,
    has_doaj_seal: bool = False,
    is_predatory: bool = False,
    source_type: Optional[str] = None,
) -> Optional[int]:
    """Derive a 1–10 quality score from bibliometric data.

    Signals are consulted in this order; the first one that yields a
    score decides the result:

    1. ``is_predatory`` (unless the venue is in DOAJ) — returns the
       predatory score immediately.
    2. ``source_type == "repository"`` — returns the flat repository
       score, ignoring every other signal.
    3. ``quartile`` — SJR-style Q1/Q2/Q3/Q4. Strongest single signal:
       this is what librarians and reviewers use when evaluating
       journals. Within Q1, ``h_index`` only decides between the
       "elite" and "strong" tiers. Unrecognised quartile strings are
       ignored and scoring continues with the next signal.
    4. ``h_index`` — used standalone when no usable quartile exists.
    5. ``is_in_doaj`` / ``has_doaj_seal`` — weakest fall-through signal.
    6. ``source_type == "conference"`` — flat conference score.

    For steps 3 and 4 the DOAJ flags act as a floor: the result is
    ``max(score, DOAJ floor)``, so a DOAJ listing can raise but never
    lower a quartile- or h-index-based score.

    H-index thresholds calibrated from real data:
      - Nature h-index: 1,442
      - PLOS ONE h-index: 467
      - Only 3 journals globally have h-index > 1,000

    Note: h-index has field-dependent bias (mathematics journals have
    naturally lower h-index than biomedical journals). These thresholds
    are general-purpose; field-specific normalization is not yet
    implemented.

    Args:
        h_index: Venue h-index; ``None``, ``0`` and negative values are
            treated as "no h-index signal".
        quartile: Quartile label, matched case-insensitively after
            stripping surrounding whitespace.
        is_in_doaj: Whether the venue is listed in DOAJ.
        has_doaj_seal: Whether the venue holds the DOAJ Seal.
        is_predatory: Whether the venue is flagged as predatory.
        source_type: Source type string; only ``"repository"`` and
            ``"conference"`` are special-cased here.

    Returns:
        Score 1–10, or ``None`` if there is not enough signal.
    """

    def _apply_doaj_floor(base: int) -> int:
        # DOAJ membership is an orthogonal quality signal: it may lift
        # a score up to its floor but must never pull one down. The
        # Seal is checked first, so it wins when both flags are set.
        if has_doaj_seal:
            return max(base, DOAJ_QUALITY_WITH_SEAL)
        if is_in_doaj:
            return max(base, DOAJ_QUALITY_NO_SEAL)
        return base

    # A predatory flag is decisive unless DOAJ lists the venue.
    if is_predatory and not is_in_doaj:
        return JOURNAL_QUALITY_PREDATORY  # Auto-remove threshold

    # Preprint repositories (arXiv, bioRxiv, SSRN, PsyArXiv, ...) are
    # not peer-reviewed. Their h-index reflects citation accumulation
    # across the thousands of papers they aggregate — not venue rigor —
    # so they get a flat score regardless of h-index or quartile. The
    # filter's Tier 3.5 institution-salvage path can lift this via
    # author affiliations when appropriate.
    #
    # NOTE: only "repository" is capped here. "conference" gets its own
    # flat score at the very end of this function. Other OpenAlex
    # source types — "book series" (Springer Lecture Notes etc.) and
    # "ebook platform" (Elsevier ScienceDirect, Springer Link) — CAN be
    # peer-reviewed, so h-index scoring intentionally applies to them.
    # Reviewed in the PR #3081 audit; not a gap.
    if source_type == "repository":
        return REPOSITORY_QUALITY_DEFAULT

    # Quartile takes precedence — it is the canonical librarian signal.
    if quartile:
        tier = quartile.upper().strip()
        if tier == "Q1":
            # A high h-index bumps a Q1 to "elite" so Nature stays
            # distinguishable from a typical Q1.
            is_elite = bool(h_index) and h_index > JOURNAL_HINDEX_ELITE
            tier_score: Optional[int] = (
                JOURNAL_QUALITY_ELITE if is_elite else JOURNAL_QUALITY_STRONG
            )
        else:
            lower_tiers = {
                "Q2": JOURNAL_QUALITY_VERY_GOOD,
                "Q3": JOURNAL_QUALITY_GOOD,
                "Q4": JOURNAL_QUALITY_ACCEPTABLE,
            }
            # Unknown labels map to None and fall through to h-index.
            tier_score = lower_tiers.get(tier)

        if tier_score is not None:
            return _apply_doaj_floor(tier_score)

    # h_index=0 means newly indexed, not meaningful. Negative values
    # would be a data error — both are treated as "no h-index signal"
    # rather than being scored as DEFAULT, which would be ambiguous.
    if h_index and h_index > 0:
        # Ordered from the highest threshold down; the first threshold
        # strictly exceeded determines the score.
        ladder = (
            (JOURNAL_HINDEX_ELITE, JOURNAL_QUALITY_ELITE),  # Nature/Science/NEJM
            (JOURNAL_HINDEX_STRONG, JOURNAL_QUALITY_STRONG),
            (JOURNAL_HINDEX_VERY_GOOD, JOURNAL_QUALITY_VERY_GOOD),
            (JOURNAL_HINDEX_GOOD, JOURNAL_QUALITY_GOOD),
            (JOURNAL_HINDEX_ACCEPTABLE, JOURNAL_QUALITY_ACCEPTABLE),
        )
        h_based = next(
            (score for threshold, score in ladder if h_index > threshold),
            JOURNAL_QUALITY_DEFAULT,
        )
        return _apply_doaj_floor(h_based)

    # No quartile and no usable h-index: DOAJ listing is all we have.
    if is_in_doaj:
        # DOAJ Seal = top ~10% of DOAJ journals (best OA practices)
        if has_doaj_seal:
            return DOAJ_QUALITY_WITH_SEAL
        return DOAJ_QUALITY_NO_SEAL

    if source_type == "conference":
        # Neutral — in CS, top conferences are Q1-equivalent
        return CONFERENCE_QUALITY_DEFAULT

    return None  # Insufficient data

164 

165 

def institution_score_from_h_index(h_index: Optional[int]) -> Optional[int]:
    """Map an institution's h-index onto a quality score.

    Used by the Tier 3.5 affiliation salvage path in the filter. The
    best possible result is ``INSTITUTION_QUALITY_TOP`` (documented as
    a cap of 6 — the value itself lives in the constants module), so an
    institution alone never beats a real venue match.

    Args:
        h_index: Institution h-index, or ``None`` when unknown.

    Returns:
        ``None`` when ``h_index`` is ``None``; otherwise the TOP, HIGH
        or DEFAULT institution score, chosen by the first threshold the
        h-index strictly exceeds.
    """
    if h_index is None:
        return None

    if h_index > INSTITUTION_HINDEX_TOP:
        tier_score = INSTITUTION_QUALITY_TOP  # Top-tier research universities
    elif h_index > INSTITUTION_HINDEX_HIGH:
        tier_score = INSTITUTION_QUALITY_HIGH
    else:
        # Any other known h-index (including 0) gets the default tier.
        tier_score = INSTITUTION_QUALITY_DEFAULT
    return tier_score