Coverage for src/local_deep_research/journal_quality/data_sources/base.py: 61%
37 statements
« prev ^ index » next coverage.py v7.14.1, created at 2026-06-03 23:15 +0000
« prev ^ index » next coverage.py v7.14.1, created at 2026-06-03 23:15 +0000
1"""DataSource base class for academic data downloads.
3Each external dataset (OpenAlex sources, DOAJ journals, Stop Predatory
4Journals, JabRef abbreviations, …) is a subclass of `DataSource` that
5declares its metadata as class attributes and implements the `fetch()`
6method. The `data_sources` package's `ALL_SOURCES` registry then drives
7the bulk download flow, the dashboard banner status endpoint, and the
8lazy-load auto-download path in `JournalDataManager`.
9"""
11from __future__ import annotations
13from abc import ABC, abstractmethod
14from pathlib import Path
15from typing import Any
17from loguru import logger
class DataSource(ABC):
    """Abstract base class for one downloadable academic data source.

    Subclasses MUST override the metadata class attributes (key, name,
    url, license, license_url, description, filename, count_label) and
    implement `fetch()`. They MAY override `auto_download`, `required`,
    and `approx_size_mb`.
    """

    # ── metadata (subclasses MUST override) ────────────────────────
    key: str = ""
    name: str = ""
    url: str = ""
    # Direct link to the actual dataset/dump that fetch() downloads —
    # the bulk file or manifest, not the project homepage. Surfaced
    # on the dashboard so users can inspect the upstream artifact.
    dataset_url: str = ""
    license: str = ""
    license_url: str = ""
    description: str = ""
    filename: str = ""
    count_label: str = ""

    # ── policy (sensible defaults) ─────────────────────────────────
    # If True, lazy-load callers will fetch the file on demand. Set
    # only for small files (<1 MB) so first-use latency stays low.
    auto_download: bool = False
    # If True, a fetch() failure inside the bulk download loop is
    # fatal — the loop returns early with an error message. Other
    # sources are best-effort (logged and skipped).
    required: bool = False
    # Informational; surfaced to UIs that want to warn the user
    # about download size before they hit the button.
    approx_size_mb: float = 0.0

    # ── methods (subclasses override only fetch) ───────────────────

    @abstractmethod
    def fetch(self, data_dir: Path, progress_cb=None) -> int:
        """Download from upstream and write `self.filename` into data_dir.

        Subclasses are responsible for HTTP, parsing, and atomic write
        (download to a `.tmp` file then rename). Should raise on
        unrecoverable network/parse errors so the caller can decide
        whether to abort the bulk download or continue best-effort.

        Args:
            data_dir: target directory for the compact output file.
            progress_cb: optional `Callable[[int, int, str], None]`
                that chunk-processing sources call periodically as
                ``progress_cb(done, total, detail)``. Used by the
                dashboard to drive a live per-source progress bar.
                One-shot sources (no loops) can ignore this.

        Returns:
            Number of records fetched (used in success messages).
        """

    def is_present(self, data_dir: Path) -> bool:
        """True if the data file exists in `data_dir` as a regular file.

        Args:
            data_dir: directory expected to contain `self.filename`.

        Returns:
            True only when `data_dir / self.filename` is a file. A
            missing path or a directory at that path yields False.
        """
        # `is_file()` rather than `exists()`: with an unset (empty)
        # `filename`, `data_dir / ""` is `data_dir` itself, and
        # `exists()` would report the source as present whenever the
        # directory exists.
        return (data_dir / self.filename).is_file()

    def ensure(self, data_dir: Path) -> bool:
        """Ensure the data file is present, optionally fetching on demand.

        - If already present → return True (no work).
        - If missing AND `auto_download=True` → fetch and return whether
          the file is present after the fetch attempt.
        - If missing AND `auto_download=False` → return False without
          touching the network. Caller is expected to drive the bulk
          download flow (dashboard "Download Data" button) instead.

        Idempotent and safe to call repeatedly. Logs but does not
        re-raise exceptions from `fetch()` (or from creating
        `data_dir`).

        Args:
            data_dir: directory that should contain `self.filename`;
                created (with parents) before an on-demand fetch.

        Returns:
            True if the data file is present when the call finishes,
            False otherwise.
        """
        if self.is_present(data_dir):
            return True
        if not self.auto_download:
            return False
        logger.info(
            f"{self.name} data not found — fetching from upstream "
            f"(~{self.approx_size_mb:.1f} MB)..."
        )
        try:
            data_dir.mkdir(parents=True, exist_ok=True)
            self.fetch(data_dir)
        except Exception:
            # Deliberate best-effort boundary: lazy-load callers get a
            # False result instead of an exception.
            logger.exception(f"Failed to fetch {self.name}")
            return False
        # Re-check instead of trusting fetch(): a fetch that returned
        # without writing the file must still count as "not present".
        return self.is_present(data_dir)

    def status_dict(self, data_dir: Path) -> dict[str, Any]:
        """Return the dict shape consumed by the dashboard banner JS.

        Mirrors the entries the JS in `journal_quality.html` expects:
        `key`, `name`, `url`, `dataset_url`, `license`, `license_url`,
        `description`, `file`, `present`.

        Args:
            data_dir: directory checked for `self.filename` to compute
                the `present` flag.

        Returns:
            A dict of this source's metadata plus its `present` state.
        """
        return {
            "key": self.key,
            "name": self.name,
            "url": self.url,
            "dataset_url": self.dataset_url,
            "license": self.license,
            "license_url": self.license_url,
            "description": self.description,
            "file": self.filename,
            "present": self.is_present(data_dir),
        }