Coverage for src/local_deep_research/database/models/citation.py: 94%
32 statements
« prev ^ index » next coverage.py v7.14.1, created at 2026-06-03 23:15 +0000
« prev ^ index » next coverage.py v7.14.1, created at 2026-06-03 23:15 +0000
1"""
2Deduplicated academic paper metadata and appearance tracking.
4``Paper`` stores bibliographic data once per unique paper (deduped by
5DOI / arXiv ID / PMID). ``PaperAppearance`` links papers to the
6research resources that found them (many-to-one: many resources can
7reference the same paper).
9Journal quality is NEVER stored here — always derived at query time
10via ``journal_id`` → ``journals`` table → reference DB.
11"""
13from sqlalchemy import (
14 JSON,
15 Column,
16 ForeignKey,
17 Index,
18 Integer,
19 String,
20)
21from sqlalchemy.orm import relationship
22from sqlalchemy_utc import UtcDateTime, utcnow
24from .base import Base
class Paper(Base):
    """One row per distinct academic paper, keyed by DOI / arXiv ID / PMID.

    A row is inserted the first time any research session encounters the
    paper. Later sightings of the same paper (matching DOI, or matching
    arXiv ID when no DOI is available) reuse that row and fill in any
    identifiers it was still missing.

    The schema is deliberately small. Real columns exist only for:

    * dedup lookups: ``doi``, ``arxiv_id``, ``pmid``
    * dashboard joins: ``journal_id``, ``container_title``
    * indexed filtering: ``year``

    All remaining bibliographic data (authors, volume, CSL-JSON, ...)
    lives in the ``paper_metadata`` JSON blob — the same hybrid
    relational-JSON layout used by OpenAlex and Crossref.

    ``container_title`` holds the cleaned journal name that the filter
    scored (after regex cleaning, abbreviation expansion and the optional
    LLM relabel). It is always set when the filter scored the journal and
    is indexed so the dashboard can GROUP BY it and batch-enrich from the
    shared read-only reference DB.

    There is intentionally no per-paper quality column: a stored snapshot
    would become stale whenever a journal is re-scored (new LLM model,
    bug fix, manual override). The dashboard instead resolves quality
    live — Tier 4 through ``journals.quality`` keyed by NFKC-normalized
    ``container_title``, Tier 1-3 through the bundled reference DB.
    Do not re-introduce a ``journal_quality`` column here.
    """

    __tablename__ = "papers"

    # Named secondary indexes for the non-unique lookup columns defined
    # further down (venue link, cleaned journal name, publication year).
    __table_args__ = (
        Index("idx_papers_journal", "journal_id"),
        Index("idx_papers_container_title", "container_title"),
        Index("idx_papers_year", "year"),
    )

    id = Column(Integer, primary_key=True, autoincrement=True)

    # --- Academic identifiers (waterfall dedup keys) -------------------
    # Each is UNIQUE so that concurrent writers cannot insert two rows
    # for the same paper. SQLite (per standard SQL) permits multiple
    # NULLs in a UNIQUE column, so papers lacking an identifier are still
    # accepted; they simply cannot be deduplicated through that key.
    # ``unique=True`` already produces a backing index, which is why no
    # ``index=True`` or explicit ``Index`` exists for these columns.
    #
    # KNOWN-DEFERRED: String(255) covers the vast majority of real-world
    # DOIs (CrossRef recommends <= 200 chars). A pathological dataset DOI
    # approaching 2000 chars would fail on insert rather than silently
    # corrupt. Post-merge follow-up: bump to String(512) and add a
    # truncation guard in _extract_doi.
    doi = Column(String(255), unique=True, nullable=True)
    arxiv_id = Column(String(100), unique=True, nullable=True)
    pmid = Column(String(50), unique=True, nullable=True)

    # Link to the venue row used for quality lookups; quality itself is
    # derived at query time. Indexed via ``idx_papers_journal``.
    journal_id = Column(
        Integer,
        ForeignKey("journals.id", ondelete="SET NULL"),
        nullable=True,
    )

    # Cleaned journal name (regex clean / abbreviation expand / optional
    # LLM relabel) that keyed the filter's score. This is the dashboard's
    # GROUP BY key. Indexed via ``idx_papers_container_title``.
    container_title = Column(String(500), nullable=True)

    # NOTE: there is deliberately no ``journal_quality`` column here (the
    # class docstring explains why). Quality is resolved at render time
    # from journals.quality (Tier 4) or the bundled reference DB
    # (Tier 1-3), so re-scoring a journal automatically reaches old
    # papers.

    # Publication year, lifted out of the metadata JSON into a proper
    # integer column so the dashboard can filter / group / sort by year
    # without running json_extract on every row. It is always written
    # together with the JSON copy in ``paper_metadata`` so existing
    # readers keep working; treat this column as a denormalized index
    # surface. Indexed via ``idx_papers_year``.
    year = Column(Integer, nullable=True)

    # Remaining bibliographic fields (authors, volume, pages,
    # container_title, publisher, item_type, pmcid, csl_json, ...) kept
    # as one JSON blob. ``year`` is ALSO present in here as the CSL-JSON
    # source of truth; the ``year`` column above is only a denormalized
    # copy for indexed queries. The Python attribute is called
    # ``paper_metadata`` because ``metadata`` is reserved on SQLAlchemy's
    # declarative Base, while the SQL column name stays ``metadata``.
    # This mirrors the ResearchResource pattern
    # (``resource_metadata = Column("metadata", JSON)``).
    paper_metadata = Column("metadata", JSON, nullable=True)

    # --- Timestamps ----------------------------------------------------
    created_at = Column(
        UtcDateTime,
        default=utcnow(),
        nullable=False,
    )
    updated_at = Column(
        UtcDateTime,
        default=utcnow(),
        onupdate=utcnow(),
        nullable=False,
    )

    # --- Relationships -------------------------------------------------
    # Appearance rows belong to their paper: they are cascaded on every
    # session operation and removed once orphaned.
    appearances = relationship(
        "PaperAppearance", back_populates="paper", cascade="all, delete-orphan"
    )
    # Also exposes the reverse collection as ``Journal.papers``.
    journal = relationship("Journal", backref="papers")

    def __repr__(self):
        """Return a short debug string with the id, DOI and journal id."""
        summary = (
            f"id={self.id}, doi={self.doi!r}, journal_id={self.journal_id}"
        )
        return f"<Paper({summary})>"
class PaperAppearance(Base):
    """Association between a Paper and the ResearchResource that found it.

    A single search result (ResearchResource) may point at no more than
    one paper. A given paper, however, can surface in many research
    sessions; every such sighting gets its own PaperAppearance row while
    the paper's metadata continues to live exactly once in ``papers``.
    """

    __tablename__ = "paper_appearances"

    id = Column(Integer, primary_key=True, autoincrement=True)

    # Owning paper; the FK is declared with ON DELETE CASCADE.
    paper_id = Column(
        Integer,
        ForeignKey("papers.id", ondelete="CASCADE"),
        index=True,
        nullable=False,
    )

    # ``unique=True`` guarantees a ResearchResource row shows up in at
    # most one PaperAppearance. That is deliberate: a resource stands for
    # ONE search-result row describing ONE paper. When a research session
    # retries and creates a fresh ResearchResource, the new row has its
    # own resource_id and may link to a (possibly different) paper with
    # no conflict. Dropping the UNIQUE would let one resource claim
    # several papers, which makes no sense in the domain model. The
    # unique constraint already provides a backing unique index, so there
    # is no additional ``index=True`` or explicit ``Index()``.
    resource_id = Column(
        Integer,
        ForeignKey("research_resources.id", ondelete="CASCADE"),
        unique=True,
        nullable=False,
    )

    # Search engine that surfaced this paper for this resource.
    # KNOWN-DEFERRED: populated from normalize_citation output but not
    # yet read by any route or service. Kept for future per-engine
    # analytics (e.g., "how many predatory journals via arxiv vs
    # openalex"). Do not remove without confirming that no consumer is
    # planned.
    source_engine = Column(String(50), nullable=True)

    created_at = Column(
        UtcDateTime,
        default=utcnow(),
        nullable=False,
    )

    # --- Relationships -------------------------------------------------
    paper = relationship("Paper", back_populates="appearances")
    resource = relationship("ResearchResource", back_populates="paper_appearance")

    def __repr__(self):
        """Return a short debug string with the paper and resource ids."""
        summary = f"paper_id={self.paper_id}, resource_id={self.resource_id}"
        return f"<PaperAppearance({summary})>"