Coverage for src/local_deep_research/journal_quality/data

1"""DataSource base class for academic data downloads.

3Each external dataset (OpenAlex sources, DOAJ journals, Stop Predatory

4Journals, JabRef abbreviations, …) is a subclass of `DataSource` that

5declares its metadata as class attributes and implements the `fetch()`

6method. The `data_sources` package's `ALL_SOURCES` registry then drives

7the bulk download flow, the dashboard banner status endpoint, and the

8lazy-load auto-download path in `JournalDataManager`.

9"""

11from __future__ import annotations

13from abc import ABC, abstractmethod

14from pathlib import Path

15from typing import Any

17from loguru import logger

class DataSource(ABC):
    """Abstract base class for one downloadable academic data source.

    Subclasses MUST override the metadata class attributes (key, name,
    url, license, license_url, description, filename, count_label) and
    implement `fetch()`. They MAY override `auto_download`, `required`,
    and `approx_size_mb`. `dataset_url` is declared alongside the other
    metadata and is reported by `status_dict()`.
    """

    # ── metadata (subclasses MUST override) ────────────────────────
    key: str = ""
    name: str = ""
    url: str = ""
    # Direct link to the actual dataset/dump that fetch() downloads —
    # the bulk file or manifest, not the project homepage. Surfaced
    # on the dashboard so users can inspect the upstream artifact.
    dataset_url: str = ""
    license: str = ""
    license_url: str = ""
    description: str = ""
    filename: str = ""
    count_label: str = ""

    # ── policy (sensible defaults) ─────────────────────────────────
    # If True, lazy-load callers will fetch the file on demand. Set
    # only for small files (<1 MB) so first-use latency stays low.
    auto_download: bool = False
    # If True, a fetch() failure inside the bulk download loop is
    # fatal — the loop returns early with an error message. Other
    # sources are best-effort (logged and skipped).
    required: bool = False
    # Informational; surfaced to UIs that want to warn the user
    # about download size before they hit the button.
    approx_size_mb: float = 0.0

    # ── methods (subclasses override only fetch) ───────────────────

    @abstractmethod
    def fetch(self, data_dir: Path, progress_cb=None) -> int:
        """Download from upstream and write `self.filename` into data_dir.

        Subclasses are responsible for HTTP, parsing, and atomic write
        (download to a `.tmp` file then rename). Should raise on
        unrecoverable network/parse errors so the caller can decide
        whether to abort the bulk download or continue best-effort.

        Args:
            data_dir: target directory for the compact output file.
            progress_cb: optional `Callable[[int, int, str], None]`
                that chunk-processing sources call periodically as
                ``progress_cb(done, total, detail)``. Used by the
                dashboard to drive a live per-source progress bar.
                One-shot sources (no loops) can ignore this.

        Returns:
            Number of records fetched (used in success messages).
        """

    def is_present(self, data_dir: Path) -> bool:
        """True if the data file already exists in `data_dir`.

        Returns False when `filename` is empty (the base-class default):
        `data_dir / ""` is `data_dir` itself, so an unset filename would
        otherwise be reported as present whenever the directory exists.
        Only a regular file counts; a directory that happens to carry
        the expected name does not.
        """
        if not self.filename:
            return False
        return (data_dir / self.filename).is_file()

    def ensure(self, data_dir: Path) -> bool:
        """Ensure the data file is present, optionally fetching on demand.

        - If already present → return True (no work).
        - If missing AND `auto_download=True` → fetch and return whether
          the file is present after the fetch attempt.
        - If missing AND `auto_download=False` → return False without
          touching the network. Caller is expected to drive the bulk
          download flow (dashboard "Download Data" button) instead.

        Safe to call repeatedly. Logs but does not re-raise exceptions
        from `fetch()` (or from creating `data_dir`).
        """
        if self.is_present(data_dir):
            return True
        if not self.auto_download:
            return False
        logger.info(
            f"{self.name} data not found — fetching from upstream "
            f"(~{self.approx_size_mb:.1f} MB)..."
        )
        try:
            data_dir.mkdir(parents=True, exist_ok=True)
            self.fetch(data_dir)
        except Exception:
            # Deliberate best-effort boundary: lazy-load callers get a
            # bool, never an exception, so the failure is only logged.
            logger.exception(f"Failed to fetch {self.name}")
            return False
        present = self.is_present(data_dir)
        if not present:
            # fetch() returned normally but did not leave the expected
            # file behind; say so instead of failing silently.
            logger.warning(
                f"{self.name} fetch completed but {self.filename!r} "
                f"is missing from {data_dir}"
            )
        return present

    def status_dict(self, data_dir: Path) -> dict[str, Any]:
        """Return the dict shape consumed by the dashboard banner JS.

        Mirrors the entries the JS in `journal_quality.html` expects:
        `key`, `name`, `url`, `dataset_url`, `license`, `license_url`,
        `description`, `file`, `present`.
        """
        return {
            "key": self.key,
            "name": self.name,
            "url": self.url,
            "dataset_url": self.dataset_url,
            "license": self.license,
            "license_url": self.license_url,
            "description": self.description,
            "file": self.filename,
            "present": self.is_present(data_dir),
        }

Coverage for src/local_deep_research/journal_quality/data_sources/base.py: 61%

37 statements