Coverage for src/local_deep_research/journal_quality/models.py: 100%

50 statements  

« prev     ^ index     » next       coverage.py v7.14.1, created at 2026-06-03 23:15 +0000

1"""SQLAlchemy declarative models for the compiled journal-quality DB. 

2 

3These models map onto the read-only `journal_quality.db` SQLite file 

4that is built by `db.build_db()` from the gzipped JSON snapshots in 

5the user data directory. They are deliberately flat — no relationships 

6between tables — so that statement caching keeps per-query ORM 

7overhead in the ~100–300 µs range for the filter hot path. 

8 

9Main tables (the three `predatory_*` lookup tables are described on their model classes): 

10 

11- `sources` — academic venues (journals + conferences) with 

12 h-index, impact factor, DOAJ flags, predatory flags. 

13 Compiled from OpenAlex sources + DOAJ + Stop Predatory 

14 Journals + CORE conferences. 

15- `institutions`— OpenAlex institutions with h-index and ROR ID, 

16 used by the Tier 3.5 affiliation-based scoring path. 

17- `abbreviations`— JabRef journal-name abbreviation expansions used by 

18 Tier 2 name normalization (e.g. "Phys. Rev. Lett." 

19 → "Physical Review Letters"). 

20 

21The DB is rebuilt from scratch after every download, so schema changes 

22ride along automatically — no Alembic, no migration plumbing. 

23""" 

24 

25from __future__ import annotations 

26 

27from sqlalchemy import Boolean, CheckConstraint, Float, Integer, String 

28from sqlalchemy.orm import DeclarativeBase, Mapped, mapped_column 

29 

30 

class JournalQualityBase(DeclarativeBase):
    """Declarative base used only by the journal-quality models.

    Kept separate on purpose from the other declarative bases in the
    codebase (`database/models/base.py`,
    `library/download_management/models/__init__.py`). Journal-quality
    data is shared, public, read-only, and rebuilt monthly, whereas the
    other bases manage per-user encrypted state with a completely
    different lifecycle. A common base would invite cross-table foreign
    keys, which this design deliberately avoids.
    """

41 

42 

class Source(JournalQualityBase):
    """One academic venue (a journal or a conference) in `sources`."""

    __tablename__ = "sources"
    __table_args__ = (
        # Second line of defense for ``score_source``. The API layer
        # already refuses values outside its allowlist at
        # ``/api/journals`` (see metrics_routes.py
        # _ALLOWED_SCORE_SOURCES), but that guards the read path only.
        # A CHECK at the DB level also stops any future writer — a
        # refactored _populate_sources, a one-off import script, a
        # hand-edited manifest — from inserting an invalid value, so the
        # build fails fast rather than silently corrupting the
        # dashboard. The permitted values mirror what _populate_sources
        # emits in db.py (openalex + doaj).
        CheckConstraint(
            "score_source IN ('openalex', 'doaj')",
            name="ck_sources_score_source",
        ),
    )

    id: Mapped[int] = mapped_column(Integer, primary_key=True)
    name: Mapped[str] = mapped_column(String, nullable=False)
    # Deliberately non-unique: one journal may appear under several ISSN
    # variants (print + electronic), and DOAJ-only rows can share a name
    # with OpenAlex rows. The build step de-duplicates on
    # (name_lower, issn).
    name_lower: Mapped[str] = mapped_column(
        String, index=True, nullable=False
    )
    issn: Mapped[str | None] = mapped_column(String, index=True)
    openalex_source_id: Mapped[str | None] = mapped_column(
        String, index=True
    )
    source_type: Mapped[str | None] = mapped_column(String)
    publisher: Mapped[str | None] = mapped_column(String)
    h_index: Mapped[int | None] = mapped_column(Integer)
    impact_factor: Mapped[float | None] = mapped_column(Float)
    cited_by_count: Mapped[int | None] = mapped_column(Integer)
    # Display-only quartile, computed at build time from the
    # cited_by_count percentile within source_type; NULL whenever
    # cited_by_count is missing. It does NOT feed the `quality` column —
    # h-index remains canonical.
    # KNOWN-DEFERRED: the index is unused (nothing filters, sorts, or
    # groups by quartile). It stays for now because the reference DB is
    # rebuilt from JSON on a schema-version bump (see _ensure_engine in
    # db.py), so dropping it later needs only a version bump rather than
    # a per-user migration. Post-merge cleanup.
    quartile: Mapped[str | None] = mapped_column(String(2), index=True)
    quality: Mapped[int | None] = mapped_column(Integer, index=True)
    is_in_doaj: Mapped[bool] = mapped_column(Boolean, default=False)
    has_doaj_seal: Mapped[bool] = mapped_column(Boolean, default=False)
    is_predatory: Mapped[bool] = mapped_column(
        Boolean, index=True, default=False
    )
    predatory_source: Mapped[str | None] = mapped_column(String)
    # Indexed because the dashboard's /api/journals endpoint filters on
    # score_source; without the index that filter is a full scan of the
    # ~217K-row table, with it a single sub-millisecond lookup.
    # NOTE(review): the API allowlist is described as
    # {openalex, doaj, llm}, but the CHECK constraint on this table
    # admits only 'openalex' and 'doaj', so no 'llm' row can be stored
    # here — confirm the two lists are meant to differ.
    score_source: Mapped[str] = mapped_column(
        String, index=True, default="openalex"
    )

99 

100 

class Institution(JournalQualityBase):
    """A research institution taken from OpenAlex.

    Backs the Tier 3.5 affiliation-based scoring path: for a paper with
    no recognizable venue, the filter falls back to the authors'
    institutions and uses the highest h-index among them, capped at 6.
    """

    __tablename__ = "institutions"

    openalex_id: Mapped[str] = mapped_column(String, primary_key=True)
    name: Mapped[str] = mapped_column(String, nullable=False)
    name_lower: Mapped[str] = mapped_column(String, index=True)
    ror_id: Mapped[str | None] = mapped_column(String, index=True)
    country: Mapped[str | None] = mapped_column(String)
    type: Mapped[str | None] = mapped_column(String)
    h_index: Mapped[int | None] = mapped_column(Integer, index=True)
    # KNOWN-DEFERRED: OpenAlex publishes impact_factor for
    # sources/journals only, not for institutions, so this column is
    # NULL for essentially all 200K+ institution rows and no scoring
    # path reads it. It is kept for schema symmetry with
    # Source.impact_factor and to leave room for future enrichment.
    # Post-merge candidate for removal.
    impact_factor: Mapped[float | None] = mapped_column(Float)
    works_count: Mapped[int | None] = mapped_column(Integer)
    cited_by_count: Mapped[int | None] = mapped_column(Integer)

127 

128 

class PredatoryJournal(JournalQualityBase):
    """A journal name from the Stop Predatory Journals list.

    Lives in its own table instead of being only an `is_predatory` flag
    on `Source`, so the runtime check also works for arbitrary input
    names that are absent from OpenAlex's source list. The dict-based
    predecessor kept a Python set for the same reason.
    """

    __tablename__ = "predatory_journals"

    # The lowercased name is the primary key.
    name_lower: Mapped[str] = mapped_column(String, primary_key=True)

141 

142 

class PredatoryPublisher(JournalQualityBase):
    """A publisher name from the Stop Predatory Journals list.

    `is_long` marks entries whose name is at least 10 characters long.
    Only those are eligible for substring matching, which catches
    renamed variants such as "OMICS Publishing" matching
    "OMICS Publishing Group Ltd." Shorter names are matched exactly to
    avoid false positives.
    """

    __tablename__ = "predatory_publishers"

    name_lower: Mapped[str] = mapped_column(String, primary_key=True)
    is_long: Mapped[bool] = mapped_column(
        Boolean, index=True, default=False
    )

156 

157 

class PredatoryHijacked(JournalQualityBase):
    """Name of a hijacked journal, i.e. a clone of a legitimate one."""

    __tablename__ = "predatory_hijacked"

    # The lowercased name is the primary key.
    name_lower: Mapped[str] = mapped_column(String, primary_key=True)

164 

165 

class Abbreviation(JournalQualityBase):
    """Maps a JabRef journal-name abbreviation to its full name.

    Consulted case-insensitively when the filter meets an abbreviated
    journal_ref such as 'Phys. Rev. Lett.', which has to be expanded to
    'Physical Review Letters' before the OpenAlex name lookup.
    """

    __tablename__ = "abbreviations"

    # The lowercased abbreviation is the primary key.
    abbrev_lower: Mapped[str] = mapped_column(String, primary_key=True)
    full_name: Mapped[str] = mapped_column(String, nullable=False)