Coverage for src/local_deep_research/journal_quality/scoring.py: 96%
60 statements
« prev ^ index » next coverage.py v7.14.1, created at 2026-06-03 23:15 +0000
« prev ^ index » next coverage.py v7.14.1, created at 2026-06-03 23:15 +0000
1"""Pure-function quality scoring helpers.
3These are stateless and called by both the build phase
4(`db.build_db()` populating the `quality` column) and the runtime
5filter (when scoring an institution-only fallback).
7Kept in a small module of their own so the build phase doesn't have
8to import the read-only DB accessor just to get at the score
9thresholds.
10"""
12from __future__ import annotations
14import unicodedata
15from typing import Optional
17from ..constants import (
18 CONFERENCE_QUALITY_DEFAULT,
19 DOAJ_QUALITY_NO_SEAL,
20 DOAJ_QUALITY_WITH_SEAL,
21 INSTITUTION_HINDEX_HIGH,
22 INSTITUTION_HINDEX_TOP,
23 INSTITUTION_QUALITY_DEFAULT,
24 INSTITUTION_QUALITY_HIGH,
25 INSTITUTION_QUALITY_TOP,
26 JOURNAL_HINDEX_ACCEPTABLE,
27 JOURNAL_HINDEX_ELITE,
28 JOURNAL_HINDEX_GOOD,
29 JOURNAL_HINDEX_STRONG,
30 JOURNAL_HINDEX_VERY_GOOD,
31 JOURNAL_QUALITY_ACCEPTABLE,
32 JOURNAL_QUALITY_DEFAULT,
33 JOURNAL_QUALITY_ELITE,
34 JOURNAL_QUALITY_GOOD,
35 JOURNAL_QUALITY_PREDATORY,
36 JOURNAL_QUALITY_STRONG,
37 REPOSITORY_QUALITY_DEFAULT,
38 JOURNAL_QUALITY_VERY_GOOD,
39)
def normalize_name(name: str) -> str:
    """Return the canonical matching key for *name*.

    The key is built in three steps, in this order: Unicode NFKC
    normalization, lowercasing, then stripping leading/trailing
    whitespace.

    This replaces the former `_normalize` helper that was duplicated in
    `journal_reference_db.py` and `journal_data_manager.py`; keeping a
    single implementation here means the build phase and the runtime
    accessor produce the same keys.
    """
    # NFKC folds compatibility characters (ligatures, full-width forms)
    # into their plain equivalents before any case handling happens.
    compat_form = unicodedata.normalize("NFKC", name)
    lowered = compat_form.lower()
    return lowered.strip()
def derive_quality_score(
    *,
    h_index: Optional[int] = None,
    quartile: Optional[str] = None,
    is_in_doaj: bool = False,
    has_doaj_seal: bool = False,
    is_predatory: bool = False,
    source_type: Optional[str] = None,
) -> Optional[int]:
    """Derive a 1–10 quality score from bibliometric data.

    Signals are consulted in this order of preference:

    1. ``quartile`` — SJR-style Q1/Q2/Q3/Q4. The strongest single
       signal (it is what librarians and reviewers use), so it is
       honoured directly; h-index only acts as a tiebreaker that can
       promote a Q1 venue to the elite tier.
    2. ``h_index`` — used on its own when no usable quartile exists.
    3. ``is_in_doaj`` / ``has_doaj_seal`` — the weakest fall-through
       signal, and also a floor applied on top of 1 and 2.

    Two short-circuits run before any of the above: a predatory venue
    that is not in DOAJ gets ``JOURNAL_QUALITY_PREDATORY``, and a
    ``source_type`` of ``"repository"`` gets
    ``REPOSITORY_QUALITY_DEFAULT``.

    H-index thresholds were calibrated from real data:
        - Nature h-index: 1,442
        - PLOS ONE h-index: 467
        - Only 3 journals globally have h-index > 1,000

    Note: h-index is field-biased (mathematics journals naturally sit
    lower than biomedical ones). The thresholds are general-purpose;
    field-specific normalization is not yet implemented.

    Returns:
        Score 1–10, or `None` if there is not enough signal.
    """

    def _floor_by_doaj(score: int) -> int:
        # The DOAJ Seal is an orthogonal quality signal (best OA
        # practices). max() makes it a floor, so it reinforces the
        # bibliometric score and can never be silently discarded.
        if has_doaj_seal:
            return max(score, DOAJ_QUALITY_WITH_SEAL)
        if is_in_doaj:
            return max(score, DOAJ_QUALITY_NO_SEAL)
        return score

    # Predatory and not vouched for by DOAJ: auto-remove threshold.
    if is_predatory and not is_in_doaj:
        return JOURNAL_QUALITY_PREDATORY

    # Preprint repositories (arXiv, bioRxiv, SSRN, PsyArXiv, ...) are
    # not peer-reviewed; their h-index measures citations accumulated
    # over the thousands of papers they host, not venue rigor. They are
    # pinned to a flat score so the Q-tier semantics stay meaningful.
    # The filter's Tier 3.5 institution-salvage path can lift this via
    # author affiliations when appropriate.
    #
    # NOTE: only "repository" is capped here. "conference" has its own
    # flat score at the end of this function. Other OpenAlex source
    # types — "book series" (Springer Lecture Notes etc.) and "ebook
    # platform" (Elsevier ScienceDirect, Springer Link) — CAN be
    # peer-reviewed, so h-index scoring is intentionally allowed for
    # them. Reviewed in the PR #3081 audit; not a gap.
    if source_type == "repository":
        return REPOSITORY_QUALITY_DEFAULT

    # Quartile wins when present — it is the canonical librarian signal.
    if quartile:
        tier = quartile.upper().strip()
        if tier == "Q1":
            # A very high h-index lifts Q1 to "elite" so Nature stays
            # distinguishable from a typical Q1 journal.
            tier_score: Optional[int] = (
                JOURNAL_QUALITY_ELITE
                if (h_index and h_index > JOURNAL_HINDEX_ELITE)
                else JOURNAL_QUALITY_STRONG
            )
        else:
            lower_tiers = {
                "Q2": JOURNAL_QUALITY_VERY_GOOD,
                "Q3": JOURNAL_QUALITY_GOOD,
                "Q4": JOURNAL_QUALITY_ACCEPTABLE,
            }
            # An unrecognised quartile string yields None and falls
            # through to the h-index path below.
            tier_score = lower_tiers.get(tier)

        if tier_score is not None:
            return _floor_by_doaj(tier_score)

    # h_index == 0 means "newly indexed", which carries no information,
    # and a negative value is a data error. Both count as no signal
    # rather than mapping to the ambiguous DEFAULT score.
    if h_index and h_index > 0:
        # Ordered from highest threshold to lowest; the first threshold
        # the h-index strictly exceeds decides the score.
        ladder = (
            (JOURNAL_HINDEX_ELITE, JOURNAL_QUALITY_ELITE),  # Nature/Science/NEJM
            (JOURNAL_HINDEX_STRONG, JOURNAL_QUALITY_STRONG),
            (JOURNAL_HINDEX_VERY_GOOD, JOURNAL_QUALITY_VERY_GOOD),
            (JOURNAL_HINDEX_GOOD, JOURNAL_QUALITY_GOOD),
            (JOURNAL_HINDEX_ACCEPTABLE, JOURNAL_QUALITY_ACCEPTABLE),
        )
        ladder_score = next(
            (points for threshold, points in ladder if h_index > threshold),
            JOURNAL_QUALITY_DEFAULT,
        )
        # A journal with a moderate h-index but a DOAJ Seal still gets
        # at least the Seal floor.
        return _floor_by_doaj(ladder_score)

    # No bibliometrics at all: DOAJ membership is the only signal left.
    if is_in_doaj:
        # DOAJ Seal = top ~10% of DOAJ journals (best OA practices).
        if has_doaj_seal:
            return DOAJ_QUALITY_WITH_SEAL
        return DOAJ_QUALITY_NO_SEAL

    # Neutral score — in CS, top conferences are Q1-equivalent.
    if source_type == "conference":
        return CONFERENCE_QUALITY_DEFAULT

    # Insufficient data.
    return None
def institution_score_from_h_index(h_index: Optional[int]) -> Optional[int]:
    """Map an institution's h-index onto a quality score.

    Returns `None` when no h-index is given. Otherwise returns
    ``INSTITUTION_QUALITY_TOP`` above ``INSTITUTION_HINDEX_TOP``,
    ``INSTITUTION_QUALITY_HIGH`` above ``INSTITUTION_HINDEX_HIGH``, and
    ``INSTITUTION_QUALITY_DEFAULT`` for everything else.

    The result is capped at 6, so an institution alone never beats a
    real venue match. Used by the Tier 3.5 affiliation salvage path in
    the filter.
    """
    if h_index is None:
        return None

    # Highest threshold first; comparisons are strict, so an h-index
    # exactly equal to a threshold lands in the tier below it.
    tiers = (
        (INSTITUTION_HINDEX_TOP, INSTITUTION_QUALITY_TOP),  # Top-tier research universities
        (INSTITUTION_HINDEX_HIGH, INSTITUTION_QUALITY_HIGH),
    )
    for threshold, points in tiers:
        if h_index > threshold:
            return points
    return INSTITUTION_QUALITY_DEFAULT