Coverage for src/local_deep_research/database/models/citation.py: 94%

32 statements  

« prev     ^ index     » next       coverage.py v7.14.1, created at 2026-06-03 23:15 +0000

1""" 

2Deduplicated academic paper metadata and appearance tracking. 

3 

4``Paper`` stores bibliographic data once per unique paper (deduped by 

5DOI / arXiv ID / PMID). ``PaperAppearance`` links papers to the 

6research resources that found them (many-to-one: many resources can 

7reference the same paper). 

8 

9Journal quality is NEVER stored here — always derived at query time 

10via ``journal_id`` → ``journals`` table → reference DB. 

11""" 

12 

13from sqlalchemy import ( 

14 JSON, 

15 Column, 

16 ForeignKey, 

17 Index, 

18 Integer, 

19 String, 

20) 

21from sqlalchemy.orm import relationship 

22from sqlalchemy_utc import UtcDateTime, utcnow 

23 

24from .base import Base 

25 

26 

class Paper(Base):
    """One row per distinct academic paper (deduped by DOI/arXiv ID/PMID).

    A row is inserted the first time a paper shows up in any research
    session. When the same paper is seen again (matching DOI, or
    matching arXiv ID if there is no DOI), the existing row is reused
    and whichever identifiers it was still missing are merged in.

    The schema is deliberately small. Real columns exist only for:

    * dedup lookups: ``doi``, ``arxiv_id``, ``pmid``
    * dashboard joins: ``journal_id``, ``container_title``
    * indexed filtering: ``year``

    All remaining bibliographic data (authors, volume, CSL-JSON, ...)
    lives in the ``paper_metadata`` JSON blob -- the same hybrid
    relational-JSON layout used by OpenAlex and Crossref.

    ``container_title`` holds the cleaned journal name that the filter
    scored the paper under (after regex cleaning, abbreviation
    expansion and the optional LLM relabel). It is always populated
    when the filter scored the journal, and it is indexed so the
    dashboard can GROUP BY it and batch-enrich from the shared
    read-only reference DB.

    There is intentionally no per-paper quality column. A stored
    snapshot would go stale whenever a journal is re-scored (new LLM
    model, bug fix, manual override). The dashboard resolves the
    current quality live instead: Tier 4 through ``journals.quality``
    keyed by the NFKC-normalized ``container_title``, Tier 1-3 through
    the bundled reference DB. Do not re-introduce a
    ``journal_quality`` column here.
    """

    __tablename__ = "papers"

    # Named secondary indexes for the non-unique lookup columns
    # declared further down (venue join, dashboard GROUP BY key, and
    # publication-year filtering).
    __table_args__ = (
        Index("idx_papers_journal", "journal_id"),
        Index("idx_papers_container_title", "container_title"),
        Index("idx_papers_year", "year"),
    )

    id = Column(Integer, primary_key=True, autoincrement=True)

    # --- Academic identifiers (waterfall dedup keys) -----------------
    # The UNIQUE constraints stop concurrent writers from inserting
    # two rows for one paper. SQLite, following standard SQL, accepts
    # any number of NULLs in a UNIQUE column, so papers with no
    # identifier can still be stored -- they simply cannot be deduped
    # through these keys. ``unique=True`` already produces a backing
    # index, so neither ``index=True`` nor an explicit Index entry is
    # needed for these three columns.
    #
    # KNOWN-DEFERRED: String(255) covers the vast majority of
    # real-world DOIs (CrossRef recommends <= 200 chars). A
    # pathological dataset DOI approaching 2000 chars would fail on
    # insert rather than silently corrupt. A post-merge follow-up is
    # tracked to widen this to String(512) with a truncation guard in
    # _extract_doi.
    doi = Column(String(255), unique=True, nullable=True)
    arxiv_id = Column(String(100), unique=True, nullable=True)
    pmid = Column(String(50), unique=True, nullable=True)

    # --- Venue ---------------------------------------------------------
    # Link to the journal row used for quality lookups; quality itself
    # is derived at query time. Indexed via __table_args__.
    journal_id = Column(
        Integer,
        ForeignKey("journals.id", ondelete="SET NULL"),
        nullable=True,
    )

    # Cleaned journal name (post-regex-clean / abbreviation-expand /
    # optional LLM-relabel) under which the filter scored the paper.
    # Serves as the dashboard GROUP BY key. Indexed via __table_args__.
    container_title = Column(String(500), nullable=True)

    # NOTE: deliberately no ``journal_quality`` column -- see the class
    # docstring. Quality is looked up live at render time from
    # journals.quality (Tier 4) or the bundled reference DB (Tier 1-3),
    # so re-scoring a journal automatically reaches older papers.

    # --- Publication year ---------------------------------------------
    # Lifted out of the metadata JSON blob into a first-class integer
    # column so the dashboard can filter/group/sort by year without a
    # json_extract on every row. Indexed via __table_args__. It is
    # always written together with the JSON copy in paper_metadata so
    # existing readers keep working; treat this column as a
    # denormalized index surface.
    year = Column(Integer, nullable=True)

    # --- Everything else ------------------------------------------------
    # Bibliographic fields (authors, volume, pages, container_title,
    # publisher, item_type, pmcid, csl_json, ...) are kept in a single
    # JSON blob. ``year`` is ALSO present in here as the CSL-JSON
    # source of truth; the ``year`` column above is just a denormalized
    # copy for indexed queries. The Python attribute is called
    # ``paper_metadata`` because ``metadata`` is reserved by
    # SQLAlchemy's declarative Base; the SQL column itself is still
    # named ``metadata``. This mirrors the ResearchResource pattern
    # (``resource_metadata = Column("metadata", JSON)``).
    paper_metadata = Column("metadata", JSON, nullable=True)

    # --- Timestamps ------------------------------------------------------
    created_at = Column(UtcDateTime, nullable=False, default=utcnow())
    updated_at = Column(
        UtcDateTime,
        nullable=False,
        default=utcnow(),
        onupdate=utcnow(),
    )

    # --- Relationships ---------------------------------------------------
    # Appearances are owned by the paper: they are cascaded on every
    # operation and removed once orphaned.
    appearances = relationship(
        "PaperAppearance",
        back_populates="paper",
        cascade="all, delete-orphan",
    )
    journal = relationship("Journal", backref="papers")

    def __repr__(self):
        """Return a short debug string with id, DOI and journal id."""
        summary = (
            f"id={self.id}, doi={self.doi!r}, journal_id={self.journal_id}"
        )
        return f"<Paper({summary})>"

144 

145 

class PaperAppearance(Base):
    """Association between a Paper and the ResearchResource that found it.

    A single search result (ResearchResource) references at most one
    paper. One paper may turn up in many research sessions, and each
    occurrence gets its own PaperAppearance row -- the paper's
    metadata, however, is stored just once in the ``papers`` table.
    """

    __tablename__ = "paper_appearances"

    id = Column(Integer, primary_key=True, autoincrement=True)

    # Owning paper; the FK is declared ON DELETE CASCADE.
    paper_id = Column(
        Integer,
        ForeignKey("papers.id", ondelete="CASCADE"),
        index=True,
        nullable=False,
    )

    # ``unique=True`` guarantees that every ResearchResource row shows
    # up in at most one PaperAppearance. That is on purpose: a resource
    # stands for ONE search-result row for ONE paper. When a research
    # session retries and creates a fresh ResearchResource, it receives
    # a new resource_id and may link to a (possibly different) paper
    # without any conflict. Dropping the UNIQUE would let one resource
    # claim several papers, which makes no sense in the domain model.
    # ``unique=True`` already creates the backing unique index, so no
    # extra ``index=True`` or explicit Index() entry is declared.
    resource_id = Column(
        Integer,
        ForeignKey("research_resources.id", ondelete="CASCADE"),
        unique=True,
        nullable=False,
    )

    # Search engine that surfaced this paper for this resource.
    # KNOWN-DEFERRED: written today from normalize_citation output but
    # not yet read by any route or service. Kept for future per-engine
    # analytics (e.g., "how many predatory journals via arxiv vs
    # openalex"). Do not remove without confirming that no planned
    # consumer exists.
    source_engine = Column(String(50), nullable=True)

    created_at = Column(UtcDateTime, nullable=False, default=utcnow())

    # Relationships
    paper = relationship("Paper", back_populates="appearances")
    resource = relationship(
        "ResearchResource",
        back_populates="paper_appearance",
    )

    def __repr__(self):
        """Return a short debug string with the paper and resource ids."""
        summary = f"paper_id={self.paper_id}, resource_id={self.resource_id}"
        return f"<PaperAppearance({summary})>"