Coverage for src/local_deep_research/journal_quality/data_sources/jabref.py: 32%
60 statements
« prev ^ index » next coverage.py v7.14.1, created at 2026-06-03 23:15 +0000
« prev ^ index » next coverage.py v7.14.1, created at 2026-06-03 23:15 +0000
1"""JabRef journal abbreviations data source.
3Downloads ~14 small CSV files from the JabRef GitHub repo and merges
4them into a single gzipped JSON mapping abbreviations → full names.
5"""
7from __future__ import annotations
9import csv
10import gzip
11import io
12import json
13from pathlib import Path
15from loguru import logger
17from .base import DataSource
# Root URL of the raw CSV files in the JabRef abbreviation repository.
_JABREF_BASE = (
    "https://raw.githubusercontent.com"
    "/JabRef/abbrv.jabref.org/main/journals"
)

# Every file is named ``journal_abbreviations_<collection>.csv``. The order
# below is the download/merge order, so it decides which file wins when two
# files map the same abbreviation to different full names.
_JABREF_FILES = [
    f"journal_abbreviations_{collection}.csv"
    for collection in (
        "entrez",
        "ubc",
        "lifescience",
        "mechanical",
        "mathematics",
        "medicus",
        "ams",
        "general",
        "acs",
        "geology_physics",
        "ieee",
        "meteorology",
        "astronomy",
        "sociology",
    )
]
class JabRefSource(DataSource):
    """Journal abbreviation lookup built from the JabRef abbreviation CSVs.

    ``fetch`` downloads every file listed in ``_JABREF_FILES``, merges them
    into a single lower-cased ``abbreviation -> full name`` mapping and
    writes that mapping to ``filename`` as gzipped JSON.
    """

    # Descriptive metadata; presumably consumed by the DataSource
    # base class / registry — confirm against base.py.
    key = "jabref"  # gitleaks:allow
    name = "JabRef abbreviations"
    url = "https://github.com/JabRef/abbrv.jabref.org"
    dataset_url = (
        "https://github.com/JabRef/abbrv.jabref.org/tree/main/journals"
    )
    license = "CC0 1.0"
    license_url = "https://creativecommons.org/publicdomain/zero/1.0/"
    description = "~66K journal abbreviation → full name mappings"
    # Name of the gzipped JSON file that fetch() writes inside data_dir.
    filename = "jabref_abbreviations.json.gz"
    count_label = "abbreviations"
    auto_download = True  # ~0.5 MB total across 14 files
    required = False
    approx_size_mb = 0.5

    def fetch(self, data_dir: Path, progress_cb=None) -> int:
        """Download the JabRef CSVs and write the merged abbreviation map.

        Each CSV row is read as ``full name, abbreviation``. Every usable
        abbreviation is stored lower-cased, plus a second key with the dots
        removed when that differs. A file that fails to download or parse
        is logged and skipped; the remaining files are still processed.

        Args:
            data_dir: Directory in which ``self.filename`` is written.
            progress_cb: Accepted but not invoked by this implementation.

        Returns:
            Number of keys in the merged mapping (dotted and dot-less
            variants both count).

        Raises:
            RuntimeError: If fewer than 100 keys were collected, e.g.
                because every download failed. No file is written in that
                case, so existing data is left untouched.
        """
        from ...security.safe_requests import (
            safe_get_with_retries as safe_get,
        )

        abbrev_to_full: dict[str, str] = {}

        def remember(
            lookup_key: str, full_name: str, source: str, label: str = ""
        ) -> None:
            # Last-writer-wins across the source CSVs. Each overwrite that
            # changes the stored full name is logged at debug level (one
            # message per colliding row) so operators can audit which
            # source resolved a given abbreviation — not a warning,
            # because collisions are expected.
            previous = abbrev_to_full.get(lookup_key)
            if previous is not None and previous != full_name:
                logger.debug(
                    f"jabref collision {label}"
                    f"[{source}] {lookup_key!r}: "
                    f"{previous!r} → "
                    f"{full_name!r}"
                )
            abbrev_to_full[lookup_key] = full_name

        for filename in _JABREF_FILES:
            url = f"{_JABREF_BASE}/{filename}"
            try:
                resp = safe_get(url, timeout=30, consume_body=True)
                resp.raise_for_status()
                for row in csv.reader(io.StringIO(resp.text)):
                    if len(row) < 2:
                        continue
                    full_name = row[0].strip().strip('"')
                    abbreviation = row[1].strip().strip('"')
                    # Skip blank cells and identity rows (nothing to map).
                    if (
                        not full_name
                        or not abbreviation
                        or full_name == abbreviation
                    ):
                        continue
                    abbrev_lower = abbreviation.lower()
                    remember(abbrev_lower, full_name, filename)
                    # Also store without dots: "Phys Rev Lett" → same.
                    # An abbreviation made only of dots would collapse to
                    # an empty key, which is useless for lookups, so it is
                    # not stored.
                    no_dots = abbreviation.replace(".", "").strip().lower()
                    if no_dots and no_dots != abbrev_lower:
                        remember(no_dots, full_name, filename, "(no-dots) ")
            except Exception:
                # Preserve traceback — operators diagnosing partial
                # fetch failures need the exception type (timeout,
                # SSRF block, decode error, etc.), not just the
                # filename. The outer loop tolerates per-file
                # failures, so this is a non-fatal warning.
                logger.exception(f"Failed to fetch JabRef file: {filename}")
                continue

        # Sanity check: if every single file failed (e.g., GitHub raw
        # CDN unreachable), don't silently overwrite existing abbreviation
        # data with an empty mapping.
        _MIN_JABREF_ABBREVS = 100
        if len(abbrev_to_full) < _MIN_JABREF_ABBREVS:
            raise RuntimeError(
                f"JabRef: suspiciously few abbreviations "
                f"({len(abbrev_to_full)} < {_MIN_JABREF_ABBREVS}); "
                "refusing to overwrite existing data"
            )

        output = data_dir / self.filename
        tmp = data_dir / f"{self.filename}.tmp"
        try:
            with gzip.open(tmp, "wt", encoding="utf-8") as f:
                json.dump({"abbrev_to_full": abbrev_to_full}, f)
            # Path.replace overwrites an existing target on every platform;
            # Path.rename raises FileExistsError on Windows when the output
            # file is already present, which is the normal refresh case.
            tmp.replace(output)
        except Exception:
            # Don't leave a partial temp file behind; the original error
            # is the one worth reporting, so cleanup failures are ignored.
            try:
                tmp.unlink()
            except OSError:
                pass
            raise

        logger.info(
            f"JabRef: saved {len(abbrev_to_full)} abbreviation mappings"
        )
        return len(abbrev_to_full)