Coverage for src/local_deep_research/database/models/citation.py: 94%

"""
Deduplicated academic paper metadata and appearance tracking.

``Paper`` stores bibliographic data once per unique paper (deduped by
DOI / arXiv ID / PMID). ``PaperAppearance`` links papers to the
research resources that found them (many-to-one: many resources can
reference the same paper).

Journal quality is NEVER stored here — always derived at query time
via ``journal_id`` → ``journals`` table → reference DB.
"""

13from sqlalchemy import (

14 JSON,

15 Column,

16 ForeignKey,

17 Index,

18 Integer,

19 String,

20)

21from sqlalchemy.orm import relationship

22from sqlalchemy_utc import UtcDateTime, utcnow

24from .base import Base

class Paper(Base):
    """One row per distinct academic paper, keyed by DOI / arXiv ID / PMID.

    A row is inserted the first time any research session encounters
    the paper. When the same paper turns up again (matching DOI, or
    matching arXiv ID when no DOI is present), that row is reused and
    whichever identifiers it was still missing are merged in.

    The schema is deliberately small. Real columns exist only for:

    * dedup lookups: ``doi``, ``arxiv_id``, ``pmid``
    * dashboard joins: ``journal_id``, ``container_title``
    * indexed filtering: ``year``

    All remaining bibliographic data (authors, volume, CSL-JSON, ...)
    lives in the ``paper_metadata`` JSON blob, the same hybrid
    relational-JSON layout used by OpenAlex and Crossref.

    ``container_title`` holds the cleaned journal name that the filter
    scored the paper under (after regex cleaning, abbreviation
    expansion and the optional LLM relabel). It is always populated
    when the filter scored the journal, and it is indexed so the
    dashboard can GROUP BY it and batch-enrich from the shared
    read-only reference DB.

    There is intentionally no per-paper quality column. A stored
    snapshot would go stale whenever a journal is re-scored (new LLM
    model, bug fix, manual override), so the dashboard resolves quality
    live instead: Tier 4 from ``journals.quality`` keyed by the
    NFKC-normalized ``container_title``, Tier 1-3 from the bundled
    reference DB. Do not add a ``journal_quality`` column here.
    """

    __tablename__ = "papers"

    # Named secondary indexes for the venue link, the dashboard GROUP BY
    # key and the publication-year filter. The identifier columns are
    # not listed: their UNIQUE constraints already provide an index.
    __table_args__ = (
        Index("idx_papers_journal", "journal_id"),
        Index("idx_papers_container_title", "container_title"),
        Index("idx_papers_year", "year"),
    )

    id = Column(Integer, primary_key=True, autoincrement=True)

    # --- Academic identifiers (waterfall dedup keys) -------------------
    # Each one is UNIQUE so that concurrent writers cannot create two
    # rows for the same paper. SQLite, following standard SQL, accepts
    # any number of NULLs in a UNIQUE column, so papers without an
    # identifier can still be stored; they simply cannot be deduplicated
    # through these keys. ``unique=True`` brings its own backing index,
    # hence no ``index=True`` and no explicit ``Index`` for them.
    #
    # KNOWN-DEFERRED: String(255) covers the vast majority of real-world
    # DOIs (CrossRef recommends <= 200 chars). A pathological dataset
    # DOI approaching 2000 chars would fail on insert instead of being
    # silently corrupted. A post-merge follow-up tracks bumping this to
    # String(512) together with a truncation guard in _extract_doi.
    doi = Column(String(255), unique=True, nullable=True)
    arxiv_id = Column(String(100), unique=True, nullable=True)
    pmid = Column(String(50), unique=True, nullable=True)

    # Link to the venue, used for quality lookups performed at query
    # time. Indexed via ``idx_papers_journal`` in __table_args__.
    journal_id = Column(
        Integer,
        ForeignKey("journals.id", ondelete="SET NULL"),
        nullable=True,
    )

    # Journal name after cleaning (regex clean / abbreviation expand /
    # optional LLM relabel), i.e. the key the filter's score was stored
    # under. The dashboard groups by this value. Indexed via
    # ``idx_papers_container_title`` in __table_args__.
    container_title = Column(String(500), nullable=True)

    # NOTE: there is deliberately no ``journal_quality`` column. Quality
    # is looked up at render time from journals.quality (Tier 4) or the
    # bundled reference DB (Tier 1-3), so re-scoring a journal reaches
    # previously stored papers automatically.

    # Publication year as a first-class integer column, so the dashboard
    # can filter, group and sort by year without running json_extract on
    # every row. It is a denormalized index surface: the value is always
    # written together with the JSON copy in ``paper_metadata`` so that
    # existing readers keep working. Indexed via ``idx_papers_year`` in
    # __table_args__.
    year = Column(Integer, nullable=True)

    # Every other bibliographic field (authors, volume, pages,
    # container_title, publisher, item_type, pmcid, csl_json, ...) in
    # one JSON blob. ``year`` is stored here as well and this copy is
    # the CSL-JSON source of truth; the ``year`` column is only the
    # indexed duplicate. The Python attribute is called
    # ``paper_metadata`` because ``metadata`` is reserved on SQLAlchemy's
    # declarative Base, while the SQL column keeps the name ``metadata``.
    # Same pattern as ResearchResource
    # (``resource_metadata = Column("metadata", JSON)``).
    paper_metadata = Column("metadata", JSON, nullable=True)

    # --- Timestamps ----------------------------------------------------
    created_at = Column(UtcDateTime, nullable=False, default=utcnow())
    updated_at = Column(
        UtcDateTime,
        nullable=False,
        default=utcnow(),
        onupdate=utcnow(),
    )

    # --- Relationships -------------------------------------------------
    appearances = relationship(
        "PaperAppearance",
        back_populates="paper",
        cascade="all, delete-orphan",
    )
    journal = relationship("Journal", backref="papers")

    def __repr__(self):
        """Return a short debug representation with id, DOI and journal id."""
        fields = (
            f"id={self.id}, doi={self.doi!r}, journal_id={self.journal_id}"
        )
        return f"<Paper({fields})>"

144

145

class PaperAppearance(Base):
    """Association between a Paper and the ResearchResource that found it.

    A single search result (ResearchResource) points at no more than
    one paper. A given paper may be found in many research sessions;
    every such find adds its own PaperAppearance row, while the paper's
    metadata continues to live exactly once in the ``papers`` table.
    """

    __tablename__ = "paper_appearances"

    id = Column(Integer, primary_key=True, autoincrement=True)

    # Owning paper; the foreign key is declared with ON DELETE CASCADE.
    paper_id = Column(
        Integer,
        ForeignKey("papers.id", ondelete="CASCADE"),
        index=True,
        nullable=False,
    )

    # UNIQUE on purpose: one ResearchResource row may appear in at most
    # one PaperAppearance, because a resource stands for ONE
    # search-result row about ONE paper. When a research session retries
    # and creates a fresh ResearchResource, that resource gets a new
    # resource_id and may link to a (possibly different) paper without
    # conflict. Dropping the constraint would let a single resource
    # claim several papers, which makes no sense in the domain model.
    # ``unique=True`` already creates the backing unique index, so there
    # is no ``index=True`` and no explicit ``Index()`` entry.
    resource_id = Column(
        Integer,
        ForeignKey("research_resources.id", ondelete="CASCADE"),
        unique=True,
        nullable=False,
    )

    # Search engine that surfaced this paper for this resource.
    # KNOWN-DEFERRED: the value is written from normalize_citation
    # output but no route or service reads it yet. It is kept for future
    # per-engine analytics (e.g., "how many predatory journals via arxiv
    # vs openalex"). Do not remove it without confirming that no planned
    # consumer exists.
    source_engine = Column(String(50), nullable=True)

    created_at = Column(UtcDateTime, nullable=False, default=utcnow())

    # --- Relationships -------------------------------------------------
    paper = relationship("Paper", back_populates="appearances")
    resource = relationship(
        "ResearchResource",
        back_populates="paper_appearance",
    )

    def __repr__(self):
        """Return a short debug representation with both foreign keys."""
        fields = f"paper_id={self.paper_id}, resource_id={self.resource_id}"
        return f"<PaperAppearance({fields})>"