Coverage for src/local_deep_research/journal_quality/data_sources/jabref.py: 32%

60 statements  

« prev     ^ index     » next       coverage.py v7.14.1, created at 2026-06-03 23:15 +0000

1"""JabRef journal abbreviations data source. 

2 

3Downloads ~14 small CSV files from the JabRef GitHub repo and merges 

4them into a single gzipped JSON mapping abbreviations → full names. 

5""" 

6 

7from __future__ import annotations 

8 

9import csv 

10import gzip 

11import io 

12import json 

13from pathlib import Path 

14 

15from loguru import logger 

16 

17from .base import DataSource 

18 

# Root of the raw-content view of the JabRef abbreviation repository's
# ``journals`` directory; a CSV file name is appended to form each URL.
_JABREF_BASE = (
    "https://raw.githubusercontent.com"
    "/JabRef/abbrv.jabref.org/main/journals"
)

# CSV files to download, in processing order. Every file follows the
# ``journal_abbreviations_<topic>.csv`` naming scheme, so only the topic
# part is spelled out here.
_JABREF_FILES = [
    f"journal_abbreviations_{topic}.csv"
    for topic in (
        "entrez",
        "ubc",
        "lifescience",
        "mechanical",
        "mathematics",
        "medicus",
        "ams",
        "general",
        "acs",
        "geology_physics",
        "ieee",
        "meteorology",
        "astronomy",
        "sociology",
    )
]

38 

39 

class JabRefSource(DataSource):
    """Journal-abbreviation data source built from the JabRef CSV lists.

    ``fetch`` downloads every file named in ``_JABREF_FILES``, merges the
    rows into a single lower-cased ``abbreviation -> full name`` mapping
    and stores it as gzipped JSON under ``filename``.
    """

    key = "jabref"  # gitleaks:allow
    name = "JabRef abbreviations"
    url = "https://github.com/JabRef/abbrv.jabref.org"
    dataset_url = (
        "https://github.com/JabRef/abbrv.jabref.org/tree/main/journals"
    )
    license = "CC0 1.0"
    license_url = "https://creativecommons.org/publicdomain/zero/1.0/"
    description = "~66K journal abbreviation → full name mappings"
    filename = "jabref_abbreviations.json.gz"
    count_label = "abbreviations"
    auto_download = True  # ~0.5 MB total across 14 files
    required = False
    approx_size_mb = 0.5

    @staticmethod
    def _store(
        mapping: dict[str, str],
        key: str,
        full_name: str,
        filename: str,
        variant: str = "",
    ) -> None:
        """Set ``mapping[key] = full_name``, logging a replaced name.

        Last-writer-wins across the source CSVs. A collision is logged at
        debug level for each row that replaces a *different* full name,
        so operators can audit which source resolves a given
        abbreviation. It is not a warning, because collisions are
        expected.
        """
        previous = mapping.get(key)
        if previous is not None and previous != full_name:
            logger.debug(
                f"jabref collision {variant}"
                f"[{filename}] {key!r}: "
                f"{previous!r} -> {full_name!r}"
            )
        mapping[key] = full_name

    @classmethod
    def _merge_csv(
        cls, text: str, filename: str, mapping: dict[str, str]
    ) -> None:
        """Merge the ``full name, abbreviation`` rows of one CSV into ``mapping``.

        Rows with fewer than two columns, with an empty field, or whose
        abbreviation equals the full name are skipped. Each abbreviation
        is stored lower-cased, plus a second key with the dots removed
        when that differs.
        """
        for row in csv.reader(io.StringIO(text)):
            if len(row) < 2:
                continue
            full_name = row[0].strip().strip('"')
            abbreviation = row[1].strip().strip('"')
            if not full_name or not abbreviation or full_name == abbreviation:
                continue

            abbrev_lower = abbreviation.lower()
            cls._store(mapping, abbrev_lower, full_name, filename)

            # Also store without dots: "Phys Rev Lett" → same full name.
            # An abbreviation made only of dots would collapse to "",
            # which is not a usable lookup key, so it is skipped.
            no_dots = abbreviation.replace(".", "").strip().lower()
            if no_dots and no_dots != abbrev_lower:
                cls._store(
                    mapping, no_dots, full_name, filename, "(no-dots) "
                )

    def fetch(self, data_dir: Path, progress_cb=None) -> int:
        """Download the JabRef CSVs and write the merged mapping to disk.

        Args:
            data_dir: Directory that receives ``self.filename``.
            progress_cb: Accepted as part of the method signature; this
                implementation does not call it.

        Returns:
            Number of keys in the merged mapping (dotted and no-dots
            variants both count).

        Raises:
            RuntimeError: If fewer than 100 mappings were collected, in
                which case nothing is written.
        """
        from ...security.safe_requests import (
            safe_get_with_retries as safe_get,
        )

        abbrev_to_full: dict[str, str] = {}

        for filename in _JABREF_FILES:
            url = f"{_JABREF_BASE}/{filename}"
            try:
                resp = safe_get(url, timeout=30, consume_body=True)
                resp.raise_for_status()
                self._merge_csv(resp.text, filename, abbrev_to_full)
            except Exception:
                # Preserve traceback — operators diagnosing partial
                # fetch failures need the exception type (timeout,
                # SSRF block, decode error, etc.), not just the
                # filename. The loop tolerates per-file failures, so
                # this is non-fatal.
                logger.exception(f"Failed to fetch JabRef file: {filename}")
                continue

        # Sanity check: if every single file failed (e.g., GitHub raw
        # CDN unreachable), don't silently overwrite existing abbreviation
        # data with an empty mapping.
        _MIN_JABREF_ABBREVS = 100
        if len(abbrev_to_full) < _MIN_JABREF_ABBREVS:
            raise RuntimeError(
                f"JabRef: suspiciously few abbreviations "
                f"({len(abbrev_to_full)} < {_MIN_JABREF_ABBREVS}); "
                "refusing to overwrite existing data"
            )

        output = data_dir / self.filename
        tmp = data_dir / f"{self.filename}.tmp"
        try:
            with gzip.open(tmp, "wt", encoding="utf-8") as f:
                json.dump({"abbrev_to_full": abbrev_to_full}, f)
            # ``replace`` overwrites an existing target on every platform;
            # ``rename`` raises FileExistsError on Windows when the output
            # from a previous fetch is already present.
            tmp.replace(output)
        finally:
            # No-op after a successful replace; removes a partial temp
            # file when writing or replacing failed.
            tmp.unlink(missing_ok=True)

        logger.info(
            f"JabRef: saved {len(abbrev_to_full)} abbreviation mappings"
        )
        return len(abbrev_to_full)