Coverage for src/local_deep_research/journal_quality/data_sources/jabref.py

"""JabRef journal abbreviations data source.

Downloads 14 small CSV files from the JabRef GitHub repo and merges
them into a single gzipped JSON mapping abbreviations → full names.
"""

7from __future__ import annotations

9import csv

10import gzip

11import io

12import json

13from pathlib import Path

15from loguru import logger

17from .base import DataSource

# Raw-content URL prefix; a file name from ``_JABREF_FILES`` is appended
# after a "/" to build each download URL.
_JABREF_BASE = (
    "https://raw.githubusercontent.com"
    "/JabRef/abbrv.jabref.org/main/journals"
)

# Subject suffixes of the upstream CSV files. Order matters: the files are
# merged in this order and a later file overrides an earlier one when both
# define the same abbreviation.
_JABREF_SUBJECTS = (
    "entrez",
    "ubc",
    "lifescience",
    "mechanical",
    "mathematics",
    "medicus",
    "ams",
    "general",
    "acs",
    "geology_physics",
    "ieee",
    "meteorology",
    "astronomy",
    "sociology",
)

# Concrete CSV file names, one per subject, in merge order.
_JABREF_FILES = [
    f"journal_abbreviations_{subject}.csv" for subject in _JABREF_SUBJECTS
]

class JabRefSource(DataSource):
    """JabRef journal-abbreviation lists merged into one lookup table.

    :meth:`fetch` downloads every CSV listed in ``_JABREF_FILES`` from
    ``_JABREF_BASE`` and writes a gzipped JSON document of the form
    ``{"abbrev_to_full": {abbreviation: full_name}}`` to ``filename``.
    Keys are lower-cased, and each abbreviation is additionally stored
    with its dots removed.
    """

    key = "jabref"  # gitleaks:allow
    name = "JabRef abbreviations"
    url = "https://github.com/JabRef/abbrv.jabref.org"
    dataset_url = (
        "https://github.com/JabRef/abbrv.jabref.org/tree/main/journals"
    )
    license = "CC0 1.0"
    license_url = "https://creativecommons.org/publicdomain/zero/1.0/"
    description = "~66K journal abbreviation → full name mappings"
    filename = "jabref_abbreviations.json.gz"
    count_label = "abbreviations"
    auto_download = True  # ~0.5 MB total across 14 files
    required = False
    approx_size_mb = 0.5

    # Below this many merged keys the download is treated as failed and
    # the existing output file is left untouched.
    _MIN_JABREF_ABBREVS = 100

    def fetch(self, data_dir: Path, progress_cb=None) -> int:
        """Download, merge and store all JabRef abbreviation CSVs.

        Each file is fetched independently; a file that fails to download
        or parse is logged with its traceback and skipped. Rows merged
        from a file before it failed are kept.

        Args:
            data_dir: Directory that receives ``self.filename``. It is not
                created here.
            progress_cb: Currently unused by this implementation.

        Returns:
            Number of keys in the merged mapping (dotted and dot-less
            variants both count).

        Raises:
            RuntimeError: If fewer than ``_MIN_JABREF_ABBREVS`` keys were
                collected; nothing is written in that case.
        """
        from ...security.safe_requests import (
            safe_get_with_retries as safe_get,
        )

        abbrev_to_full: dict[str, str] = {}

        for filename in _JABREF_FILES:
            url = f"{_JABREF_BASE}/{filename}"
            try:
                resp = safe_get(url, timeout=30, consume_body=True)
                resp.raise_for_status()
                self._merge_csv_text(resp.text, filename, abbrev_to_full)
            except Exception:
                # Preserve traceback — operators diagnosing partial
                # fetch failures need the exception type (timeout,
                # SSRF block, decode error, etc.), not just the
                # filename. The outer loop tolerates per-file
                # failures, so this is non-fatal.
                logger.exception(f"Failed to fetch JabRef file: {filename}")
                continue

        # Sanity check: if every single file failed (e.g., GitHub raw
        # CDN unreachable), don't silently overwrite existing abbreviation
        # data with an empty mapping.
        if len(abbrev_to_full) < self._MIN_JABREF_ABBREVS:
            raise RuntimeError(
                f"JabRef: suspiciously few abbreviations "
                f"({len(abbrev_to_full)} < {self._MIN_JABREF_ABBREVS}); "
                "refusing to overwrite existing data"
            )

        output = data_dir / self.filename
        tmp = data_dir / f"{self.filename}.tmp"
        try:
            with gzip.open(tmp, "wt", encoding="utf-8") as f:
                json.dump({"abbrev_to_full": abbrev_to_full}, f)
            # Path.replace overwrites an existing target on every
            # platform; Path.rename raises FileExistsError on Windows
            # once the output from a previous fetch is present.
            tmp.replace(output)
        finally:
            # No-op after a successful replace; removes the partial
            # temp file when writing or replacing failed.
            tmp.unlink(missing_ok=True)

        logger.info(
            f"JabRef: saved {len(abbrev_to_full)} abbreviation mappings"
        )
        return len(abbrev_to_full)

    @classmethod
    def _merge_csv_text(
        cls, text: str, filename: str, mapping: dict[str, str]
    ) -> None:
        """Merge the ``full name, abbreviation`` rows of one CSV into *mapping*.

        Rows with fewer than two columns, with an empty field, or whose
        abbreviation equals the full name are ignored. *mapping* is
        updated in place; *filename* is only used in log messages.
        """
        for row in csv.reader(io.StringIO(text)):
            if len(row) < 2:
                continue
            full_name = row[0].strip().strip('"')
            abbreviation = row[1].strip().strip('"')
            if not full_name or not abbreviation or full_name == abbreviation:
                continue

            abbrev_lower = abbreviation.lower()
            cls._store_mapping(mapping, abbrev_lower, full_name, filename)

            # Also store without dots: "Phys Rev Lett" → same.
            # An abbreviation made only of dots would yield an empty
            # key, which must not map to a journal.
            no_dots = abbreviation.replace(".", "").strip().lower()
            if no_dots and no_dots != abbrev_lower:
                cls._store_mapping(
                    mapping, no_dots, full_name, filename, "(no-dots) "
                )

    @staticmethod
    def _store_mapping(
        mapping: dict[str, str],
        key: str,
        full_name: str,
        filename: str,
        label: str = "",
    ) -> None:
        """Set ``mapping[key] = full_name``, debug-logging an overridden value.

        Last-writer-wins across the source CSVs. A collision is logged
        only when the stored full name actually changes (not for rows
        that repeat the same value) so operators can audit which source
        resolves a given abbreviation. Debug level rather than warning,
        because collisions are expected.
        """
        previous = mapping.get(key)
        if previous is not None and previous != full_name:
            logger.debug(
                f"jabref collision {label}"
                f"[{filename}] {key!r}: "
                f"{previous!r} → "
                f"{full_name!r}"
            )
        mapping[key] = full_name

Coverage for src/local_deep_research/journal_quality/data_sources/jabref.py: 32%

60 statements