Coverage for src / local_deep_research / content_fetcher / url_classifier.py: 97%

104 statements  

« prev     ^ index     » next       coverage.py v7.13.5, created at 2026-04-14 23:55 +0000

1""" 

2URL Classifier for content fetching. 

3 

4Determines the type of URL to route to the appropriate downloader. 

5""" 

6 

7import re 

8from enum import Enum 

9from typing import Optional 

10from urllib.parse import urlparse 

11 

12 

class URLType(Enum):
    """Content categories a URL can be routed to.

    Each member's value is the stable string identifier used when the
    classification is serialized or logged.
    """

    ARXIV = "arxiv"
    PUBMED = "pubmed"
    # PubMed Central — full-text articles, unlike PUBMED abstract pages.
    PMC = "pmc"
    SEMANTIC_SCHOLAR = "semantic_scholar"
    BIORXIV = "biorxiv"
    MEDRXIV = "medrxiv"
    DOI = "doi"
    # A link that points straight at a PDF file.
    PDF = "pdf"
    # Fallback: an ordinary web page.
    HTML = "html"
    # Rejected outright: dangerous or unsupported URL scheme.
    INVALID = "invalid"

26 

27 

# Schemes rejected outright: none of these can carry fetchable web content,
# and several (javascript:, data:) are common injection vectors.
DANGEROUS_SCHEMES = {"about", "data", "file", "javascript", "vbscript"}

30 

31 

class URLClassifier:
    """Inspects URLs and routes them to the appropriate content downloader."""

    # Regexes for arXiv links (abstract, PDF, and HTML views) plus the
    # ar5iv HTML mirror.
    ARXIV_PATTERNS = [
        r"arxiv\.org/abs/",
        r"arxiv\.org/pdf/",
        r"arxiv\.org/html/",
        r"ar5iv\.org/",
    ]

    # PubMed abstract pages, in both the current and the legacy URL layouts.
    PUBMED_PATTERNS = [
        r"pubmed\.ncbi\.nlm\.nih\.gov/\d+",
        r"ncbi\.nlm\.nih\.gov/pubmed/\d+",
    ]

    # PubMed Central full-text articles.  Written in lowercase on purpose:
    # classify() lowercases the URL before matching.
    PMC_PATTERNS = [
        r"ncbi\.nlm\.nih\.gov/pmc/articles/pmc",
        r"europepmc\.org/article/pmc",
        r"europepmc\.org/articles/pmc",
    ]

    # Semantic Scholar paper pages and API endpoints.
    SEMANTIC_SCHOLAR_PATTERNS = [
        r"semanticscholar\.org/paper/",
        r"api\.semanticscholar\.org/",
    ]

    # Preprint servers.
    BIORXIV_PATTERNS = [
        r"biorxiv\.org/content/",
    ]

    MEDRXIV_PATTERNS = [
        r"medrxiv\.org/content/",
    ]

    # DOI resolver links (current and legacy hostnames).
    DOI_PATTERNS = [
        r"doi\.org/10\.",
        r"dx\.doi\.org/10\.",
    ]

    @classmethod
    def classify(cls, url: str) -> URLType:
        """
        Classify a URL to determine its type.

        Args:
            url: The URL to classify

        Returns:
            URLType enum indicating the type of content
        """
        normalized = url.lower().strip()

        # Security gate: reject dangerous or non-HTTP(S) schemes before
        # doing any content routing.
        try:
            parts = urlparse(normalized)
        except Exception:
            return URLType.INVALID
        if parts.scheme in DANGEROUS_SCHEMES:
            return URLType.INVALID
        # Scheme-less URLs ("") are tolerated; everything else must be HTTP(S).
        if parts.scheme and parts.scheme not in ("http", "https", ""):
            return URLType.INVALID

        # A bare PDF link wins over generic handling (academic PDF hosts are
        # filtered out inside _is_pdf_url and fall through to the rules below).
        if cls._is_pdf_url(normalized):
            return URLType.PDF

        # Academic sources, in priority order.  PMC is tested before PubMed
        # so full-text links on ncbi.nlm.nih.gov beat abstract links.
        routing = (
            (cls.ARXIV_PATTERNS, URLType.ARXIV),
            (cls.PMC_PATTERNS, URLType.PMC),
            (cls.PUBMED_PATTERNS, URLType.PUBMED),
            (cls.SEMANTIC_SCHOLAR_PATTERNS, URLType.SEMANTIC_SCHOLAR),
            (cls.BIORXIV_PATTERNS, URLType.BIORXIV),
            (cls.MEDRXIV_PATTERNS, URLType.MEDRXIV),
            (cls.DOI_PATTERNS, URLType.DOI),
        )
        for patterns, matched_type in routing:
            if any(re.search(pattern, normalized) for pattern in patterns):
                return matched_type

        # Anything else is treated as a generic web page.
        return URLType.HTML

    @classmethod
    def _is_pdf_url(cls, url: str) -> bool:
        """Return True when the URL looks like a direct, generic PDF link.

        PDFs hosted by academic sources (arXiv, bioRxiv, etc.) are excluded
        here so their specialized downloaders handle them instead.
        """
        lowered = url.lower()

        # Hosts whose PDFs must go through a specialized downloader.
        specialized_hosts = (
            "arxiv.org",
            "biorxiv.org",
            "medrxiv.org",
            "ncbi.nlm.nih.gov",
            "europepmc.org",
            "semanticscholar.org",
        )
        for host in specialized_hosts:
            if host in lowered:
                return False

        path = urlparse(lowered).path
        # Either an explicit .pdf extension or a conventional /pdf/ segment.
        return path.endswith(".pdf") or "/pdf/" in path

    @classmethod
    def extract_id(
        cls, url: str, url_type: Optional[URLType] = None
    ) -> Optional[str]:
        """
        Extract the identifier from a URL.

        Args:
            url: The URL to extract from
            url_type: Optional pre-classified type

        Returns:
            Extracted ID or None
        """
        if url_type is None:
            url_type = cls.classify(url)

        if url_type == URLType.ARXIV:
            # Try the new-style ID first (e.g. 2301.12345, optionally
            # versioned), then the old archive/number form
            # (e.g. cond-mat/0501234).
            for pattern in (
                r"(\d{4}\.\d{4,5}(?:v\d+)?)",
                r"([a-z-]+/\d{7}(?:v\d+)?)",
            ):
                found = re.search(pattern, url)
                if found:
                    return found.group(1)

        elif url_type == URLType.PUBMED:
            # PMID: the first run of digits delimited by slashes.
            found = re.search(r"/(\d+)/?", url)
            return found.group(1) if found else None

        elif url_type == URLType.PMC:
            # Normalize to the canonical "PMC<digits>" form.
            found = re.search(r"PMC(\d+)", url, re.IGNORECASE)
            if found:
                return f"PMC{found.group(1)}"

        elif url_type == URLType.SEMANTIC_SCHOLAR:
            # Paper ID is a 40-character hex digest, optionally preceded by
            # a title slug segment.
            found = re.search(
                r"/paper/(?:[^/]+/)?([a-f0-9]{40})", url, re.IGNORECASE
            )
            if found:
                return found.group(1)

        elif url_type == URLType.DOI:
            # DOIs always start with "10." followed by a registrant code.
            found = re.search(r"(10\.\d{4,}/[^\s]+)", url)
            return found.group(1) if found else None

        return None

    @classmethod
    def get_source_name(cls, url_type: URLType) -> str:
        """Get human-readable source name."""
        display_names = {
            URLType.ARXIV: "arXiv",
            URLType.PUBMED: "PubMed",
            URLType.PMC: "PubMed Central",
            URLType.SEMANTIC_SCHOLAR: "Semantic Scholar",
            URLType.BIORXIV: "bioRxiv",
            URLType.MEDRXIV: "medRxiv",
            URLType.DOI: "DOI",
            URLType.PDF: "PDF",
            URLType.HTML: "Web Page",
            URLType.INVALID: "Invalid URL",
        }
        return display_names.get(url_type, "Unknown")