Coverage for src/local_deep_research/research_library/downloaders/extraction/metadata_extractor.py: 99%

154 statements  

« prev     ^ index     » next       coverage.py v7.14.1, created at 2026-06-03 23:15 +0000

1""" 

2Structured metadata extraction using extruct. 

3 

4Extracts JSON-LD, OpenGraph, and microdata from HTML pages. 

5Used to enrich text extraction with structured data — especially 

6useful for product pages, articles, and other schema.org-annotated 

7content where text extraction alone misses key information. 

8""" 

9 

10from typing import Any, Dict, Optional 

11 

12from loguru import logger 

13 

14 

def extract_metadata(html: str, url: str = "") -> Dict[str, Any]:
    """Extract structured metadata from HTML.

    Pulls JSON-LD, OpenGraph, and microdata in one pass using extruct.
    Extraction is best-effort: any failure is logged at debug level and
    whatever was collected so far is returned.

    Args:
        html: Raw HTML string.
        url: Base URL for resolving relative URLs in metadata.

    Returns:
        Dict with keys: json_ld, opengraph, microdata (each a list of dicts).
        Empty lists if nothing found, extraction failed, or extruct is not
        installed.
    """
    result: Dict[str, Any] = {
        "json_ld": [],
        "opengraph": [],
        "microdata": [],
    }

    # Nothing to parse for empty / whitespace-only input.
    if not html or not html.strip():
        return result

    # extruct is an optional dependency; degrade to empty metadata without it.
    try:
        import extruct
    except ImportError:
        logger.debug("extruct not installed — skipping metadata extraction")
        return result

    try:
        data = extruct.extract(
            html,
            base_url=url,
            syntaxes=["json-ld", "opengraph", "microdata"],
            uniform=True,
        )
        # "or []" keeps the documented list contract even if a syntax key
        # is present but holds None.
        result["json_ld"] = data.get("json-ld") or []
        result["opengraph"] = data.get("opengraph") or []
        result["microdata"] = data.get("microdata") or []
    except Exception:
        # loguru does not honour the stdlib ``exc_info`` keyword; the
        # traceback is only attached through ``opt(exception=True)``.
        logger.opt(exception=True).debug("extruct metadata extraction failed")

    return result

57 

58 

def metadata_to_text(metadata: Dict[str, Any]) -> Optional[str]:
    """Convert structured metadata into readable text supplement.

    Extracts the most useful fields from JSON-LD, OpenGraph, and
    microdata and formats them as a text block that can be appended
    to extracted content.

    Sources are consulted in priority order:
      1. JSON-LD items of a recognised schema.org type.
      2. Microdata Product / SoftwareSourceCode items, but only when
         JSON-LD did not already provide an item of the same type.
      3. The first OpenGraph block that yields any text, used only when
         the two sources above produced nothing.

    Args:
        metadata: Output from extract_metadata(). Missing keys or keys
            holding None are treated as empty lists.

    Returns:
        Formatted text string (lines joined with newlines), or None if
        no useful metadata found.
    """
    # Normalise once so a missing or None section can never raise below.
    json_ld_items = metadata.get("json_ld") or []
    microdata_items = metadata.get("microdata") or []
    opengraph_items = metadata.get("opengraph") or []

    parts = []

    # JSON-LD — richest source (Schema.org types)
    for item in json_ld_items:
        item_type = item.get("@type", "")
        if isinstance(item_type, list):
            # Multi-typed node: dispatch on the first declared type.
            item_type = item_type[0] if item_type else ""

        if item_type == "Product":
            parts.extend(_format_product(item))
        elif item_type in (
            "Article",
            "NewsArticle",
            "BlogPosting",
            "ScholarlyArticle",
        ):
            parts.extend(_format_article(item))
        elif item_type == "SoftwareSourceCode":
            parts.extend(_format_software(item))
        elif item_type in ("Dataset", "CreativeWork"):
            parts.extend(_format_generic(item))

    # Microdata — check for types not covered by JSON-LD
    for item in microdata_items:
        item_type = item.get("@type", "")
        if isinstance(item_type, list):
            item_type = item_type[0] if item_type else ""

        if item_type == "Product" and not _has_type(json_ld_items, "Product"):
            parts.extend(_format_product(item))
        elif item_type == "SoftwareSourceCode" and not _has_type(
            json_ld_items, "SoftwareSourceCode"
        ):
            parts.extend(_format_software(item))

    # OpenGraph — fallback when JSON-LD/microdata don't have structured types
    if not parts:
        for item in opengraph_items:
            og_parts = _format_opengraph(item)
            if og_parts:
                parts.extend(og_parts)
                break  # Only use the first OG block

    if not parts:
        return None

    return "\n".join(parts)

121 

122 

123def _has_type(items: list, type_name: str) -> bool: 

124 """Check if any item in the list has the given @type.""" 

125 for item in items: 

126 t = item.get("@type", "") 

127 if isinstance(t, list): 

128 if type_name in t: 

129 return True 

130 elif t == type_name: 

131 return True 

132 return False 

133 

134 

135def _format_product(item: dict) -> list: 

136 """Format Product schema into readable text.""" 

137 parts = [] 

138 name = item.get("name", "") 

139 if name: 

140 parts.append(f"Product: {name}") 

141 

142 desc = item.get("description", "") 

143 if desc and len(desc) > 20: 

144 parts.append(f"Description: {desc[:500]}") 

145 

146 brand = item.get("brand", "") 

147 if isinstance(brand, dict): 

148 brand = brand.get("name", "") 

149 if brand: 

150 parts.append(f"Brand: {brand}") 

151 

152 # Price 

153 offers = item.get("offers", {}) 

154 if isinstance(offers, list): 

155 offers = offers[0] if offers else {} 

156 if isinstance(offers, dict): 

157 price = offers.get("price", "") 

158 currency = offers.get("priceCurrency", "") 

159 if price: 

160 parts.append( 

161 f"Price: {currency} {price}" if currency else f"Price: {price}" 

162 ) 

163 availability = offers.get("availability", "") 

164 if availability: 

165 parts.append(f"Availability: {availability}") 

166 

167 # Rating 

168 rating = item.get("aggregateRating", {}) 

169 if isinstance(rating, dict): 169 ↛ 175line 169 didn't jump to line 175 because the condition on line 169 was always true

170 value = rating.get("ratingValue", "") 

171 count = rating.get("reviewCount", rating.get("ratingCount", "")) 

172 if value: 

173 parts.append(f"Rating: {value}/5 ({count} reviews)") 

174 

175 return parts 

176 

177 

178def _format_article(item: dict) -> list: 

179 """Format Article schema into readable text.""" 

180 parts = [] 

181 name = item.get("headline", item.get("name", "")) 

182 if name: 182 ↛ 185line 182 didn't jump to line 185 because the condition on line 182 was always true

183 parts.append(f"Article: {name}") 

184 

185 author = item.get("author", "") 

186 if isinstance(author, dict): 

187 author = author.get("name", "") 

188 elif isinstance(author, list): 

189 author = ", ".join( 

190 a.get("name", str(a)) if isinstance(a, dict) else str(a) 

191 for a in author 

192 ) 

193 if author: 

194 parts.append(f"Author: {author}") 

195 

196 date = item.get("datePublished", "") 

197 if date: 

198 parts.append(f"Published: {date}") 

199 

200 body = item.get("articleBody", "") 

201 if body and len(body) > 50: 

202 parts.append(f"Content: {body[:2000]}") 

203 

204 return parts 

205 

206 

207def _format_software(item: dict) -> list: 

208 """Format SoftwareSourceCode schema into readable text.""" 

209 parts = [] 

210 name = item.get("name", "") 

211 if name: 211 ↛ 214line 211 didn't jump to line 214 because the condition on line 211 was always true

212 parts.append(f"Repository: {name}") 

213 

214 author = item.get("author", "") 

215 if isinstance(author, dict): 

216 author = author.get("name", "") 

217 elif isinstance(author, list): 

218 author = ", ".join( 

219 a.get("name", str(a)) if isinstance(a, dict) else str(a) 

220 for a in author 

221 ) 

222 if author: 

223 parts.append(f"Author: {author}") 

224 

225 desc = item.get("description", "") 

226 if desc: 

227 parts.append(f"Description: {desc[:500]}") 

228 

229 text = item.get("text", "") 

230 if text and len(text) > 50: 

231 parts.append(f"Content: {text[:2000]}") 

232 

233 return parts 

234 

235 

236def _format_opengraph(item: dict) -> list: 

237 """Format OpenGraph metadata into readable text.""" 

238 parts = [] 

239 og_type = item.get("@type", item.get("og:type", "")) 

240 title = item.get("og:title", "") 

241 desc = item.get("og:description", "") 

242 site = item.get("og:site_name", "") 

243 

244 # Filter out generic OG types that add no information as prefixes 

245 generic_og_types = {"website", "article", ""} 

246 if title: 

247 prefix = ( 

248 site 

249 if site 

250 else (og_type if og_type.lower() not in generic_og_types else "") 

251 ) 

252 parts.append(f"{prefix}: {title}" if prefix else title) 

253 if desc and len(desc) > 20: 

254 parts.append(f"Description: {desc[:500]}") 

255 

256 # Price tags (some e-commerce sites put these in OG) 

257 price = item.get("product:price:amount", item.get("og:price:amount", "")) 

258 currency = item.get( 

259 "product:price:currency", item.get("og:price:currency", "") 

260 ) 

261 if price: 

262 parts.append(f"Price: {currency} {price}") 

263 

264 return parts 

265 

266 

267def _format_generic(item: dict) -> list: 

268 """Format a generic CreativeWork/Dataset schema.""" 

269 parts = [] 

270 name = item.get("name", item.get("headline", "")) 

271 if name: 

272 parts.append(f"Name: {name}") 

273 

274 desc = item.get("description", "") 

275 if desc and len(desc) > 20: 

276 parts.append(f"Description: {desc[:500]}") 

277 

278 return parts