Coverage for src/local_deep_research/journal_quality/data_sources/base.py: 61%

37 statements  

« prev     ^ index     » next       coverage.py v7.14.1, created at 2026-06-03 23:15 +0000

1"""DataSource base class for academic data downloads. 

2 

3Each external dataset (OpenAlex sources, DOAJ journals, Stop Predatory 

4Journals, JabRef abbreviations, …) is a subclass of `DataSource` that 

5declares its metadata as class attributes and implements the `fetch()` 

6method. The `data_sources` package's `ALL_SOURCES` registry then drives 

7the bulk download flow, the dashboard banner status endpoint, and the 

8lazy-load auto-download path in `JournalDataManager`. 

9""" 

10 

11from __future__ import annotations 

12 

13from abc import ABC, abstractmethod 

14from pathlib import Path 

15from typing import Any 

16 

17from loguru import logger 

18 

19 

class DataSource(ABC):
    """Abstract base class for one downloadable academic data source.

    Subclasses MUST override the metadata class attributes (key, name,
    url, license, license_url, description, filename, count_label) and
    implement `fetch()`. They MAY override `auto_download`, `required`,
    and `approx_size_mb`.

    A source whose `filename` is left empty is never reported as
    present (see `is_present`).
    """

    # ── metadata (subclasses MUST override) ────────────────────────
    key: str = ""
    name: str = ""
    url: str = ""
    # Direct link to the actual dataset/dump that fetch() downloads —
    # the bulk file or manifest, not the project homepage. Surfaced
    # on the dashboard so users can inspect the upstream artifact.
    dataset_url: str = ""
    license: str = ""
    license_url: str = ""
    description: str = ""
    filename: str = ""
    count_label: str = ""

    # ── policy (sensible defaults) ─────────────────────────────────
    # If True, lazy-load callers will fetch the file on demand. Set
    # only for small files (<1 MB) so first-use latency stays low.
    auto_download: bool = False
    # If True, a fetch() failure inside the bulk download loop is
    # fatal — the loop returns early with an error message. Other
    # sources are best-effort (logged and skipped).
    required: bool = False
    # Informational; surfaced to UIs that want to warn the user
    # about download size before they hit the button.
    approx_size_mb: float = 0.0

    # ── methods (subclasses override only fetch) ───────────────────

    @abstractmethod
    def fetch(self, data_dir: Path, progress_cb=None) -> int:
        """Download from upstream and write `self.filename` into data_dir.

        Subclasses are responsible for HTTP, parsing, and atomic write
        (download to a `.tmp` file then rename). Should raise on
        unrecoverable network/parse errors so the caller can decide
        whether to abort the bulk download or continue best-effort.

        Args:
            data_dir: target directory for the compact output file.
            progress_cb: optional `Callable[[int, int, str], None]`
                that chunk-processing sources call periodically as
                ``progress_cb(done, total, detail)``. Used by the
                dashboard to drive a live per-source progress bar.
                One-shot sources (no loops) can ignore this.

        Returns:
            Number of records fetched (used in success messages).
        """

    def is_present(self, data_dir: Path) -> bool:
        """True if the data file already exists in `data_dir`.

        Returns False when `filename` is empty: `data_dir / ""` resolves
        to `data_dir` itself, so without this guard a source that never
        set `filename` would look "present" as soon as the directory
        exists.
        """
        if not self.filename:
            return False
        return (data_dir / self.filename).exists()

    def ensure(self, data_dir: Path) -> bool:
        """Ensure the data file is present, optionally fetching on demand.

        - If already present → return True (no work).
        - If missing AND `auto_download=True` → fetch and return whether
          the file is present after the fetch attempt.
        - If missing AND `auto_download=False` → return False without
          touching the network. Caller is expected to drive the bulk
          download flow (dashboard "Download Data" button) instead.

        Idempotent and safe to call repeatedly. Logs but does not
        re-raise exceptions from `fetch()`.
        """
        if self.is_present(data_dir):
            return True
        if not self.auto_download:
            return False
        logger.info(
            f"{self.name} data not found — fetching from upstream "
            f"(~{self.approx_size_mb:.1f} MB)..."
        )
        try:
            # The target directory may not exist yet on first use.
            data_dir.mkdir(parents=True, exist_ok=True)
            self.fetch(data_dir)
        except Exception:
            # Deliberate best-effort boundary: lazy-load callers get a
            # boolean, the traceback goes to the log.
            logger.exception(f"Failed to fetch {self.name}")
            return False
        # Re-check instead of assuming success: fetch() returning does
        # not by itself prove the file was written.
        return self.is_present(data_dir)

    def status_dict(self, data_dir: Path) -> dict[str, Any]:
        """Return the dict shape consumed by the dashboard banner JS.

        Mirrors the entries the JS in `journal_quality.html` expects:
        `key`, `name`, `url`, `dataset_url`, `license`, `license_url`,
        `description`, `file`, `present`.

        `file` carries `self.filename`; `present` is the result of
        `is_present(data_dir)` at call time.
        """
        return {
            "key": self.key,
            "name": self.name,
            "url": self.url,
            "dataset_url": self.dataset_url,
            "license": self.license,
            "license_url": self.license_url,
            "description": self.description,
            "file": self.filename,
            "present": self.is_present(data_dir),
        }