Coverage for src/local_deep_research/journal_quality/models.py: 100%
50 statements
« prev ^ index » next coverage.py v7.14.1, created at 2026-06-03 23:15 +0000
« prev ^ index » next coverage.py v7.14.1, created at 2026-06-03 23:15 +0000
1"""SQLAlchemy declarative models for the compiled journal-quality DB.
3These models map onto the read-only `journal_quality.db` SQLite file
4that is built by `db.build_db()` from the gzipped JSON snapshots in
5the user data directory. They are deliberately flat — no relationships
6between tables — so that statement caching keeps per-query ORM
7overhead in the ~100–300 µs range for the filter hot path.
Six tables:
- `sources`              — academic venues (journals + conferences) with
                           h-index, impact factor, DOAJ flags, predatory flags.
                           Compiled from OpenAlex sources + DOAJ + Stop Predatory
                           Journals + CORE conferences.
- `institutions`         — OpenAlex institutions with h-index and ROR ID,
                           used by the Tier 3.5 affiliation-based scoring path.
- `predatory_journals`   — journal names on the Stop Predatory Journals list.
- `predatory_publishers` — publisher names on the Stop Predatory Journals list.
- `predatory_hijacked`   — hijacked journal names (clones of legitimate
                           journals).
- `abbreviations`        — JabRef journal-name abbreviation expansions used by
                           Tier 2 name normalization (e.g. "Phys. Rev. Lett."
                           → "Physical Review Letters").
21The DB is rebuilt from scratch after every download, so schema changes
22ride along automatically — no Alembic, no migration plumbing.
23"""
25from __future__ import annotations
27from sqlalchemy import Boolean, CheckConstraint, Float, Integer, String
28from sqlalchemy.orm import DeclarativeBase, Mapped, mapped_column
class JournalQualityBase(DeclarativeBase):
    """Declarative base used exclusively by the journal-quality models.

    This base is deliberately kept apart from the other declarative
    bases in the codebase (`database/models/base.py`,
    `library/download_management/models/__init__.py`). Journal-quality
    data is shared, public, read-only, and rebuilt monthly, whereas the
    other bases manage per-user encrypted state with a completely
    different lifecycle. A common base would invite cross-table foreign
    keys, which are not wanted here.
    """
class Source(JournalQualityBase):
    """A single academic venue: either a journal or a conference."""

    __tablename__ = "sources"
    __table_args__ = (
        # Second line of defense for ``score_source``. Out-of-allowlist
        # values are already rejected by the API layer at
        # ``/api/journals`` (metrics_routes.py, _ALLOWED_SCORE_SOURCES),
        # but that protects the read path only. This DB-level CHECK
        # also stops any future writer — a reworked _populate_sources,
        # an ad-hoc import script, a hand-edited manifest — from
        # inserting an invalid value: the build fails immediately
        # rather than silently corrupting the dashboard. The permitted
        # values mirror what _populate_sources in db.py emits today
        # (openalex + doaj).
        CheckConstraint(
            "score_source IN ('openalex', 'doaj')",
            name="ck_sources_score_source",
        ),
    )

    # --- identity -----------------------------------------------------
    id: Mapped[int] = mapped_column(Integer, primary_key=True)
    name: Mapped[str] = mapped_column(String, nullable=False)
    # Deliberately not unique: one journal may carry several ISSN
    # variants (print + electronic), and a DOAJ-only entry can share a
    # name with an OpenAlex entry. Deduplication happens at build time
    # on (name_lower, issn).
    name_lower: Mapped[str] = mapped_column(
        String,
        nullable=False,
        index=True,
    )
    issn: Mapped[str | None] = mapped_column(String, index=True)
    openalex_source_id: Mapped[str | None] = mapped_column(
        String,
        index=True,
    )
    source_type: Mapped[str | None] = mapped_column(String)
    publisher: Mapped[str | None] = mapped_column(String)

    # --- bibliometrics ------------------------------------------------
    h_index: Mapped[int | None] = mapped_column(Integer)
    impact_factor: Mapped[float | None] = mapped_column(Float)
    cited_by_count: Mapped[int | None] = mapped_column(Integer)
    # Quartile is for display only. It is derived at build time from
    # the cited_by_count percentile within source_type and is NULL
    # whenever cited_by_count is missing. It does NOT contribute to the
    # `quality` column — h-index remains the canonical signal.
    # KNOWN-DEFERRED: index=True is unused — no query filters, sorts,
    # or groups by quartile. It is kept because the reference DB is
    # rebuilt from JSON on a schema-version bump (see _ensure_engine in
    # db.py), so dropping the index later needs only a schema version
    # bump, not a per-user migration. Post-merge cleanup.
    quartile: Mapped[str | None] = mapped_column(String(2), index=True)
    quality: Mapped[int | None] = mapped_column(Integer, index=True)

    # --- DOAJ / predatory flags ---------------------------------------
    is_in_doaj: Mapped[bool] = mapped_column(Boolean, default=False)
    has_doaj_seal: Mapped[bool] = mapped_column(Boolean, default=False)
    is_predatory: Mapped[bool] = mapped_column(
        Boolean,
        default=False,
        index=True,
    )
    predatory_source: Mapped[str | None] = mapped_column(String)

    # --- provenance ---------------------------------------------------
    # Indexed because the dashboard's /api/journals endpoint filters on
    # score_source (API allowlist {openalex, doaj, llm}). Without the
    # index that filter is a full scan of the ~217K-row table; with it,
    # the filter is a single sub-millisecond lookup.
    # NOTE(review): the API allowlist named above includes `llm`, but
    # the CHECK constraint in __table_args__ admits only 'openalex' and
    # 'doaj' — confirm no writer is expected to store `llm` rows here.
    score_source: Mapped[str] = mapped_column(
        String,
        default="openalex",
        index=True,
    )
class Institution(JournalQualityBase):
    """A research institution taken from OpenAlex.

    Consumed by the Tier 3.5 affiliation-based scoring path. When a
    paper has no recognizable venue, the filter falls back to the
    authors' institutions and uses the highest h-index among them,
    capped at 6.
    """

    __tablename__ = "institutions"

    # --- identity -----------------------------------------------------
    openalex_id: Mapped[str] = mapped_column(String, primary_key=True)
    name: Mapped[str] = mapped_column(String, nullable=False)
    name_lower: Mapped[str] = mapped_column(String, index=True)
    ror_id: Mapped[str | None] = mapped_column(String, index=True)
    country: Mapped[str | None] = mapped_column(String)
    type: Mapped[str | None] = mapped_column(String)

    # --- bibliometrics ------------------------------------------------
    h_index: Mapped[int | None] = mapped_column(Integer, index=True)
    # KNOWN-DEFERRED: OpenAlex publishes impact_factor for
    # sources/journals only, not for institutions. This column is
    # therefore NULL for essentially all 200K+ institution rows and no
    # scoring path reads it. It is retained for schema symmetry with
    # Source.impact_factor and to leave room for future enrichment.
    # Post-merge candidate for removal.
    impact_factor: Mapped[float | None] = mapped_column(Float)
    works_count: Mapped[int | None] = mapped_column(Integer)
    cited_by_count: Mapped[int | None] = mapped_column(Integer)
class PredatoryJournal(JournalQualityBase):
    """One journal name from the Stop Predatory Journals list.

    These names live in their own table instead of relying solely on
    the `is_predatory` flag of `Source`, so that the runtime check also
    works for arbitrary input names that are absent from OpenAlex's
    source list. The dict-based predecessor kept a Python set for the
    same reason.
    """

    __tablename__ = "predatory_journals"

    # The lowercased journal name is itself the primary key.
    name_lower: Mapped[str] = mapped_column(String, primary_key=True)
class PredatoryPublisher(JournalQualityBase):
    """One publisher name from the Stop Predatory Journals list.

    `is_long` marks entries whose name is at least 10 characters long.
    Only those are eligible for substring matching, which catches
    renamed variants such as "OMICS Publishing" matching
    "OMICS Publishing Group Ltd." Shorter names are matched exactly, to
    avoid false positives.
    """

    __tablename__ = "predatory_publishers"

    # The lowercased publisher name is itself the primary key.
    name_lower: Mapped[str] = mapped_column(String, primary_key=True)
    is_long: Mapped[bool] = mapped_column(
        Boolean,
        default=False,
        index=True,
    )
class PredatoryHijacked(JournalQualityBase):
    """One hijacked journal name, i.e. a clone of a legitimate journal."""

    __tablename__ = "predatory_hijacked"

    # The lowercased journal name is itself the primary key.
    name_lower: Mapped[str] = mapped_column(String, primary_key=True)
class Abbreviation(JournalQualityBase):
    """Maps a JabRef journal-name abbreviation to its full name.

    The filter performs a case-insensitive lookup here when it meets an
    abbreviated journal_ref such as 'Phys. Rev. Lett.', which has to be
    expanded to 'Physical Review Letters' before the OpenAlex name
    lookup.
    """

    __tablename__ = "abbreviations"

    # Lowercased abbreviation (primary key) and the expansion it maps to.
    abbrev_lower: Mapped[str] = mapped_column(String, primary_key=True)
    full_name: Mapped[str] = mapped_column(String, nullable=False)