Coverage for src / local_deep_research / content_fetcher / url_classifier.py: 97%
104 statements
« prev ^ index » next coverage.py v7.13.5, created at 2026-04-14 23:55 +0000
1"""
2URL Classifier for content fetching.
4Determines the type of URL to route to the appropriate downloader.
5"""
7import re
8from enum import Enum
9from typing import Optional
10from urllib.parse import urlparse
class URLType(Enum):
    """Types of URLs we can handle.

    Returned by ``URLClassifier.classify`` to route a URL to the
    appropriate downloader.
    """

    ARXIV = "arxiv"
    PUBMED = "pubmed"
    PMC = "pmc"  # PubMed Central
    SEMANTIC_SCHOLAR = "semantic_scholar"
    BIORXIV = "biorxiv"
    MEDRXIV = "medrxiv"
    DOI = "doi"
    PDF = "pdf"  # Direct PDF link
    HTML = "html"  # Generic web page
    INVALID = "invalid"  # Dangerous/unsupported URL schemes
# URL schemes that should be rejected (security risk): javascript/vbscript
# can execute script, while data/file/about point at non-network or local
# resources that should never be fetched as remote content.
DANGEROUS_SCHEMES = {"javascript", "data", "file", "vbscript", "about"}
class URLClassifier:
    """Classifies URLs to determine the appropriate downloader.

    Classification is purely pattern-based: the URL is lowercased and
    matched against per-source regex lists in a fixed priority order.
    ``extract_id`` pulls the source-native identifier (arXiv ID, PMID,
    PMC ID, Semantic Scholar hash, DOI) out of a URL.
    """

    # ArXiv patterns
    ARXIV_PATTERNS = [
        r"arxiv\.org/abs/",
        r"arxiv\.org/pdf/",
        r"arxiv\.org/html/",
        r"ar5iv\.org/",
    ]

    # PubMed patterns
    PUBMED_PATTERNS = [
        r"pubmed\.ncbi\.nlm\.nih\.gov/\d+",
        r"ncbi\.nlm\.nih\.gov/pubmed/\d+",
    ]

    # PMC patterns (PubMed Central - full text)
    # Note: patterns match lowercase since URLs are lowercased before matching
    PMC_PATTERNS = [
        r"ncbi\.nlm\.nih\.gov/pmc/articles/pmc",
        r"europepmc\.org/article/pmc",
        r"europepmc\.org/articles/pmc",
    ]

    # Semantic Scholar patterns
    SEMANTIC_SCHOLAR_PATTERNS = [
        r"semanticscholar\.org/paper/",
        r"api\.semanticscholar\.org/",
    ]

    # BioRxiv/MedRxiv patterns
    BIORXIV_PATTERNS = [
        r"biorxiv\.org/content/",
    ]

    MEDRXIV_PATTERNS = [
        r"medrxiv\.org/content/",
    ]

    # DOI patterns
    DOI_PATTERNS = [
        r"doi\.org/10\.",
        r"dx\.doi\.org/10\.",
    ]

    @classmethod
    def classify(cls, url: str) -> URLType:
        """
        Classify a URL to determine its type.

        Args:
            url: The URL to classify

        Returns:
            URLType enum indicating the type of content
        """
        url_lower = url.lower().strip()

        # Reject dangerous/non-web schemes before any pattern matching
        # (security); scheme-less URLs ("" after parsing) are allowed.
        try:
            parsed = urlparse(url_lower)
            if parsed.scheme in DANGEROUS_SCHEMES:
                return URLType.INVALID
            # Only allow http/https (or no scheme at all)
            if parsed.scheme not in ("http", "https", ""):
                return URLType.INVALID
        except Exception:
            return URLType.INVALID

        # Check for direct PDF link first.  Academic-source PDFs are
        # excluded by _is_pdf_url, so they fall through to the
        # specialized dispatch below.
        if cls._is_pdf_url(url_lower):
            return URLType.PDF

        # Ordered dispatch table for the academic sources.  Order
        # matters: PMC is tested before PUBMED because PMC article URLs
        # also live under ncbi.nlm.nih.gov.
        dispatch = (
            (cls.ARXIV_PATTERNS, URLType.ARXIV),
            (cls.PMC_PATTERNS, URLType.PMC),
            (cls.PUBMED_PATTERNS, URLType.PUBMED),
            (cls.SEMANTIC_SCHOLAR_PATTERNS, URLType.SEMANTIC_SCHOLAR),
            (cls.BIORXIV_PATTERNS, URLType.BIORXIV),
            (cls.MEDRXIV_PATTERNS, URLType.MEDRXIV),
            (cls.DOI_PATTERNS, URLType.DOI),
        )
        for patterns, url_type in dispatch:
            if any(re.search(pattern, url_lower) for pattern in patterns):
                return url_type

        # Default to HTML for web pages
        return URLType.HTML

    @classmethod
    def _is_pdf_url(cls, url: str) -> bool:
        """Check if URL points directly to a PDF.

        Note: Academic source PDFs (arXiv, bioRxiv, etc.) are handled
        by their respective patterns, not as generic PDFs.
        """
        url_lower = url.lower()

        # Academic sources with PDF URLs should use their specialized
        # downloaders, so never report them as generic PDFs.
        academic_domains = [
            "arxiv.org",
            "biorxiv.org",
            "medrxiv.org",
            "ncbi.nlm.nih.gov",
            "europepmc.org",
            "semanticscholar.org",
        ]
        if any(domain in url_lower for domain in academic_domains):
            return False

        # Inspect only the path component so query strings don't
        # produce false positives.
        path = urlparse(url_lower).path

        # Check file extension
        if path.endswith(".pdf"):
            return True

        # Check common PDF URL patterns
        if "/pdf/" in path:
            return True

        return False

    @classmethod
    def extract_id(
        cls, url: str, url_type: Optional[URLType] = None
    ) -> Optional[str]:
        """
        Extract the identifier from a URL.

        Args:
            url: The URL to extract from
            url_type: Optional pre-classified type; classified on the
                fly when omitted

        Returns:
            Extracted ID or None
        """
        if url_type is None:
            url_type = cls.classify(url)

        if url_type == URLType.ARXIV:
            # New-style arXiv ID (e.g. 2301.12345, optional version suffix)
            match = re.search(r"(\d{4}\.\d{4,5}(?:v\d+)?)", url)
            if match:
                return match.group(1)
            # Old-style ID (e.g. cond-mat/0501234)
            match = re.search(r"([a-z-]+/\d{7}(?:v\d+)?)", url)
            if match:
                return match.group(1)

        elif url_type == URLType.PUBMED:
            # PMID: first run of digits following a slash
            match = re.search(r"/(\d+)/?", url)
            if match:
                return match.group(1)

        elif url_type == URLType.PMC:
            # Normalize to the canonical "PMC<digits>" form
            match = re.search(r"PMC(\d+)", url, re.IGNORECASE)
            if match:
                return f"PMC{match.group(1)}"

        elif url_type == URLType.SEMANTIC_SCHOLAR:
            # Paper ID is a 40-char hex hash, optionally preceded by a
            # title slug segment
            match = re.search(
                r"/paper/(?:[^/]+/)?([a-f0-9]{40})", url, re.IGNORECASE
            )
            if match:
                return match.group(1)

        elif url_type == URLType.DOI:
            # DOI: "10.<registrant>/<suffix>", suffix runs to whitespace
            match = re.search(r"(10\.\d{4,}/[^\s]+)", url)
            if match:
                return match.group(1)

        return None

    @classmethod
    def get_source_name(cls, url_type: URLType) -> str:
        """Get human-readable source name."""
        names = {
            URLType.ARXIV: "arXiv",
            URLType.PUBMED: "PubMed",
            URLType.PMC: "PubMed Central",
            URLType.SEMANTIC_SCHOLAR: "Semantic Scholar",
            URLType.BIORXIV: "bioRxiv",
            URLType.MEDRXIV: "medRxiv",
            URLType.DOI: "DOI",
            URLType.PDF: "PDF",
            URLType.HTML: "Web Page",
            URLType.INVALID: "Invalid URL",
        }
        return names.get(url_type, "Unknown")