Coverage for src / local_deep_research / content_fetcher / url_classifier.py: 97%

104 statements  

« prev     ^ index     » next       coverage.py v7.13.5, created at 2026-04-14 23:55 +0000

1""" 

2URL Classifier for content fetching. 

3 

4Determines the type of URL to route to the appropriate downloader. 

5""" 

6 

7import re 

8from enum import Enum 

9from typing import Optional 

10from urllib.parse import urlparse 

11 

12 

class URLType(Enum):
    """Content categories a URL can be routed to.

    Each member's value is the stable string identifier used when the
    classification is serialized or logged.
    """

    ARXIV = "arxiv"
    PUBMED = "pubmed"
    # PubMed Central — full-text articles, unlike PUBMED abstract pages.
    PMC = "pmc"
    SEMANTIC_SCHOLAR = "semantic_scholar"
    BIORXIV = "biorxiv"
    MEDRXIV = "medrxiv"
    DOI = "doi"
    # A link that points straight at a PDF file.
    PDF = "pdf"
    # Fallback: an ordinary web page.
    HTML = "html"
    # Rejected outright: dangerous or unsupported URL scheme.
    INVALID = "invalid"

26 

27 

# Schemes rejected outright: none of these can carry fetchable web content,
# and several (javascript:, data:) are common injection vectors.
DANGEROUS_SCHEMES = {"about", "data", "file", "javascript", "vbscript"}

30 

31 

class URLClassifier:
    """Inspects URLs and routes them to the appropriate content downloader."""

    # Regexes for arXiv links (abstract, PDF, and HTML views) plus the
    # ar5iv HTML mirror.
    ARXIV_PATTERNS = [
        r"arxiv\.org/abs/",
        r"arxiv\.org/pdf/",
        r"arxiv\.org/html/",
        r"ar5iv\.org/",
    ]

    # PubMed abstract pages, in both the current and the legacy URL layouts.
    PUBMED_PATTERNS = [
        r"pubmed\.ncbi\.nlm\.nih\.gov/\d+",
        r"ncbi\.nlm\.nih\.gov/pubmed/\d+",
    ]

    # PubMed Central full-text articles.  Written in lowercase on purpose:
    # classify() lowercases the URL before matching.
    PMC_PATTERNS = [
        r"ncbi\.nlm\.nih\.gov/pmc/articles/pmc",
        r"europepmc\.org/article/pmc",
        r"europepmc\.org/articles/pmc",
    ]

    # Semantic Scholar paper pages and API endpoints.
    SEMANTIC_SCHOLAR_PATTERNS = [
        r"semanticscholar\.org/paper/",
        r"api\.semanticscholar\.org/",
    ]

    # Preprint servers.
    BIORXIV_PATTERNS = [
        r"biorxiv\.org/content/",
    ]

    MEDRXIV_PATTERNS = [
        r"medrxiv\.org/content/",
    ]

    # DOI resolver links (current and legacy hostnames).
    DOI_PATTERNS = [
        r"doi\.org/10\.",
        r"dx\.doi\.org/10\.",
    ]

    @classmethod
    def classify(cls, url: str) -> URLType:
        """
        Classify a URL to determine its type.

        Args:
            url: The URL to classify

        Returns:
            URLType enum indicating the type of content
        """
        normalized = url.lower().strip()

        # Security gate: reject dangerous or non-HTTP(S) schemes before
        # doing any content routing.
        try:
            parts = urlparse(normalized)
        except Exception:
            return URLType.INVALID
        if parts.scheme in DANGEROUS_SCHEMES:
            return URLType.INVALID
        # Scheme-less URLs ("") are tolerated; everything else must be HTTP(S).
        if parts.scheme and parts.scheme not in ("http", "https", ""):
            return URLType.INVALID

        # A bare PDF link wins over generic handling (academic PDF hosts are
        # filtered out inside _is_pdf_url and fall through to the rules below).
        if cls._is_pdf_url(normalized):
            return URLType.PDF

        # Academic sources, in priority order.  PMC is tested before PubMed
        # so full-text links on ncbi.nlm.nih.gov beat abstract links.
        routing = (
            (cls.ARXIV_PATTERNS, URLType.ARXIV),
            (cls.PMC_PATTERNS, URLType.PMC),
            (cls.PUBMED_PATTERNS, URLType.PUBMED),
            (cls.SEMANTIC_SCHOLAR_PATTERNS, URLType.SEMANTIC_SCHOLAR),
            (cls.BIORXIV_PATTERNS, URLType.BIORXIV),
            (cls.MEDRXIV_PATTERNS, URLType.MEDRXIV),
            (cls.DOI_PATTERNS, URLType.DOI),
        )
        for patterns, matched_type in routing:
            if any(re.search(pattern, normalized) for pattern in patterns):
                return matched_type

        # Anything else is treated as a generic web page.
        return URLType.HTML

    @classmethod
    def _is_pdf_url(cls, url: str) -> bool:
        """Return True when the URL looks like a direct, generic PDF link.

        PDFs hosted by academic sources (arXiv, bioRxiv, etc.) are excluded
        here so their specialized downloaders handle them instead.
        """
        lowered = url.lower()

        # Hosts whose PDFs must go through a specialized downloader.
        specialized_hosts = (
            "arxiv.org",
            "biorxiv.org",
            "medrxiv.org",
            "ncbi.nlm.nih.gov",
            "europepmc.org",
            "semanticscholar.org",
        )
        for host in specialized_hosts:
            if host in lowered:
                return False

        path = urlparse(lowered).path
        # Either an explicit .pdf extension or a conventional /pdf/ segment.
        return path.endswith(".pdf") or "/pdf/" in path

    @classmethod
    def extract_id(
        cls, url: str, url_type: Optional[URLType] = None
    ) -> Optional[str]:
        """
        Extract the identifier from a URL.

        Args:
            url: The URL to extract from
            url_type: Optional pre-classified type

        Returns:
            Extracted ID or None
        """
        if url_type is None:
            url_type = cls.classify(url)

        if url_type == URLType.ARXIV:
            # Try the new-style ID first (e.g. 2301.12345, optionally
            # versioned), then the old archive/number form
            # (e.g. cond-mat/0501234).
            for pattern in (
                r"(\d{4}\.\d{4,5}(?:v\d+)?)",
                r"([a-z-]+/\d{7}(?:v\d+)?)",
            ):
                found = re.search(pattern, url)
                if found:
                    return found.group(1)

        elif url_type == URLType.PUBMED:
            # PMID: the first run of digits delimited by slashes.
            found = re.search(r"/(\d+)/?", url)
            return found.group(1) if found else None

        elif url_type == URLType.PMC:
            # Normalize to the canonical "PMC<digits>" form.
            found = re.search(r"PMC(\d+)", url, re.IGNORECASE)
            if found:
                return f"PMC{found.group(1)}"

        elif url_type == URLType.SEMANTIC_SCHOLAR:
            # Paper ID is a 40-character hex digest, optionally preceded by
            # a title slug segment.
            found = re.search(
                r"/paper/(?:[^/]+/)?([a-f0-9]{40})", url, re.IGNORECASE
            )
            if found:
                return found.group(1)

        elif url_type == URLType.DOI:
            # DOIs always start with "10." followed by a registrant code.
            found = re.search(r"(10\.\d{4,}/[^\s]+)", url)
            return found.group(1) if found else None

        return None

    @classmethod
    def get_source_name(cls, url_type: URLType) -> str:
        """Get human-readable source name."""
        display_names = {
            URLType.ARXIV: "arXiv",
            URLType.PUBMED: "PubMed",
            URLType.PMC: "PubMed Central",
            URLType.SEMANTIC_SCHOLAR: "Semantic Scholar",
            URLType.BIORXIV: "bioRxiv",
            URLType.MEDRXIV: "medRxiv",
            URLType.DOI: "DOI",
            URLType.PDF: "PDF",
            URLType.HTML: "Web Page",
            URLType.INVALID: "Invalid URL",
        }
        return display_names.get(url_type, "Unknown")