Coverage for src / local_deep_research / research_library / downloaders / extraction / metadata_extractor.py: 68%

154 statements  

« prev     ^ index     » next       coverage.py v7.13.5, created at 2026-04-14 23:55 +0000

1""" 

2Structured metadata extraction using extruct. 

3 

4Extracts JSON-LD, OpenGraph, and microdata from HTML pages. 

5Used to enrich text extraction with structured data — especially 

6useful for product pages, articles, and other schema.org-annotated 

7content where text extraction alone misses key information. 

8""" 

9 

10from typing import Any, Dict, Optional 

11 

12from loguru import logger 

13 

14 

def extract_metadata(html: str, url: str = "") -> Dict[str, Any]:
    """Extract structured metadata from HTML.

    Pulls JSON-LD, OpenGraph, and microdata in one pass using extruct.
    Extraction is best-effort: failures are logged at debug level and
    never raised to the caller.

    Args:
        html: Raw HTML string.
        url: Base URL for resolving relative URLs in metadata.

    Returns:
        Dict with keys: json_ld, opengraph, microdata (each a list).
        The lists are empty if the input is blank, extruct is not
        installed, or extraction fails.
    """
    result: Dict[str, Any] = {
        "json_ld": [],
        "opengraph": [],
        "microdata": [],
    }

    # Nothing to parse: skip the import and the parser entirely.
    if not html or not html.strip():
        return result

    # extruct is imported lazily so a missing install degrades to
    # "no metadata" instead of breaking module import.
    try:
        import extruct
    except ImportError:
        logger.debug("extruct not installed — skipping metadata extraction")
        return result

    try:
        data = extruct.extract(
            html,
            base_url=url,
            syntaxes=["json-ld", "opengraph", "microdata"],
            uniform=True,
        )
        # `or []` keeps the "always a list" contract even if a syntax key
        # is present with a None value.
        result["json_ld"] = data.get("json-ld") or []
        result["opengraph"] = data.get("opengraph") or []
        result["microdata"] = data.get("microdata") or []
    except Exception:
        # loguru does not understand the stdlib `exc_info=True` keyword (it
        # is treated as a plain format argument), so the traceback must be
        # attached through `opt(exception=True)`.
        logger.opt(exception=True).debug("extruct metadata extraction failed")

    return result

57 

58 

def _resolve_schema_type(item: dict, supported: tuple) -> Any:
    """Return the item's @type, preferring a supported one when it is a list.

    Args:
        item: A single metadata item (dict).
        supported: Type names the caller knows how to format.

    Returns:
        The scalar @type as-is; for a list-valued @type, the first entry
        found in ``supported``, else the first entry, else "".
    """
    raw_type = item.get("@type", "")
    if isinstance(raw_type, list):
        for candidate in raw_type:
            if candidate in supported:
                return candidate
        return raw_type[0] if raw_type else ""
    return raw_type


def metadata_to_text(metadata: Dict[str, Any]) -> Optional[str]:
    """Convert structured metadata into readable text supplement.

    Extracts the most useful fields from JSON-LD, OpenGraph, and
    microdata and formats them as a text block that can be appended
    to extracted content. JSON-LD is used first, microdata fills in
    Product / SoftwareSourceCode entries that JSON-LD did not provide,
    and OpenGraph is only a fallback when neither produced any text.

    Args:
        metadata: Output from extract_metadata(). Missing keys are treated
            as empty lists; entries that are not dicts are ignored.

    Returns:
        Formatted text string, or None if no useful metadata found.
    """
    article_types = (
        "Article",
        "NewsArticle",
        "BlogPosting",
        "ScholarlyArticle",
    )
    generic_types = ("Dataset", "CreativeWork")
    json_ld_types = (
        "Product",
        *article_types,
        "SoftwareSourceCode",
        *generic_types,
    )
    microdata_types = ("Product", "SoftwareSourceCode")

    # JSON-LD scripts may contain top-level arrays of scalars; keep only
    # dict entries so `.get` is always safe below and inside _has_type.
    json_ld_items = [
        entry
        for entry in (metadata.get("json_ld") or [])
        if isinstance(entry, dict)
    ]
    microdata_items = [
        entry
        for entry in (metadata.get("microdata") or [])
        if isinstance(entry, dict)
    ]

    parts: list = []

    # JSON-LD — richest source (Schema.org types)
    for item in json_ld_items:
        item_type = _resolve_schema_type(item, json_ld_types)
        if item_type == "Product":
            parts.extend(_format_product(item))
        elif item_type in article_types:
            parts.extend(_format_article(item))
        elif item_type == "SoftwareSourceCode":
            parts.extend(_format_software(item))
        elif item_type in generic_types:
            parts.extend(_format_generic(item))

    # Microdata — only for types JSON-LD did not already describe
    for item in microdata_items:
        item_type = _resolve_schema_type(item, microdata_types)
        if item_type not in microdata_types:
            continue
        if _has_type(json_ld_items, item_type):
            continue
        if item_type == "Product":
            parts.extend(_format_product(item))
        else:
            parts.extend(_format_software(item))

    # OpenGraph — fallback when JSON-LD/microdata don't have structured types
    if not parts:
        for item in metadata.get("opengraph") or []:
            if not isinstance(item, dict):
                continue
            og_parts = _format_opengraph(item)
            if og_parts:
                parts.extend(og_parts)
                break  # Only use the first OG block that yields text

    if not parts:
        return None

    return "\n".join(parts)

121 

122 

def _has_type(items: list, type_name: str) -> bool:
    """Check if any item in the list has the given @type.

    Args:
        items: Metadata items to scan. Entries that are not dicts are
            skipped; a None/empty list yields False.
        type_name: Schema type name to look for (e.g. "Product").

    Returns:
        True if some item's @type equals ``type_name`` or, for a
        list-valued @type, contains it.
    """
    for item in items or ():
        if not isinstance(item, dict):
            continue
        declared = item.get("@type", "")
        if isinstance(declared, list):
            if type_name in declared:
                return True
        elif declared == type_name:
            return True
    return False

133 

134 

def _format_product(item: dict) -> list:
    """Format Product schema into readable text.

    Args:
        item: A Product item (JSON-LD or uniform microdata dict).

    Returns:
        List of "Label: value" lines for name, description, brand,
        price, availability and rating; fields that are missing or
        empty are omitted.
    """
    parts = []

    name = item.get("name", "")
    if name:
        parts.append(f"Product: {name}")

    # Only plain strings are measured and sliced; structured values
    # (dicts, lists) are skipped instead of raising.
    desc = item.get("description", "")
    if isinstance(desc, str) and len(desc) > 20:
        parts.append(f"Description: {desc[:500]}")

    brand = item.get("brand", "")
    if isinstance(brand, dict):
        brand = brand.get("name", "")
    if brand:
        parts.append(f"Brand: {brand}")

    # Price — `offers` may be a single offer or a list; use the first one.
    offers = item.get("offers", {})
    if isinstance(offers, list):
        offers = offers[0] if offers else {}
    if isinstance(offers, dict):
        price = offers.get("price", "")
        currency = offers.get("priceCurrency", "")
        # A numeric 0 is a real price (free item) even though it is falsy.
        is_zero_price = (
            isinstance(price, (int, float))
            and not isinstance(price, bool)
            and price == 0
        )
        if price or is_zero_price:
            parts.append(
                f"Price: {currency} {price}" if currency else f"Price: {price}"
            )
        availability = offers.get("availability", "")
        if availability:
            parts.append(f"Availability: {availability}")

    # Rating
    rating = item.get("aggregateRating", {})
    if isinstance(rating, dict):
        value = rating.get("ratingValue", "")
        if value:
            # schema.org defines bestRating as defaulting to 5.
            best = rating.get("bestRating") or 5
            count = rating.get("reviewCount") or rating.get("ratingCount")
            line = f"Rating: {value}/{best}"
            # Leave the count out entirely rather than emit "( reviews)".
            if count:
                line += f" ({count} reviews)"
            parts.append(line)

    return parts

176 

177 

def _format_article(item: dict) -> list:
    """Format Article schema into readable text.

    Args:
        item: An Article-like item (Article, NewsArticle, BlogPosting,
            ScholarlyArticle) as a dict.

    Returns:
        List of lines for title, author(s), publication date and a
        body excerpt (at most 2000 characters); missing or empty
        fields are omitted.
    """
    parts = []

    # Fall back to `name` when `headline` is absent *or* empty.
    title = item.get("headline") or item.get("name", "")
    if title:
        parts.append(f"Article: {title}")

    author = item.get("author", "")
    if isinstance(author, dict):
        author = author.get("name", "")
    elif isinstance(author, list):
        names = [
            entry.get("name", "") if isinstance(entry, dict) else entry
            for entry in author
        ]
        # Drop nameless entries and coerce to str so join cannot fail
        # on non-string names.
        author = ", ".join(str(n) for n in names if n)
    if author:
        parts.append(f"Author: {author}")

    date = item.get("datePublished", "")
    if date:
        parts.append(f"Published: {date}")

    # Only plain-string bodies are excerpted; structured values are skipped.
    body = item.get("articleBody", "")
    if isinstance(body, str) and len(body) > 50:
        parts.append(f"Content: {body[:2000]}")

    return parts

205 

206 

def _format_software(item: dict) -> list:
    """Format SoftwareSourceCode schema into readable text.

    Args:
        item: A SoftwareSourceCode item as a dict.

    Returns:
        List of lines for repository name, author(s), description
        (at most 500 characters) and a text excerpt (at most 2000
        characters); missing or empty fields are omitted.
    """
    parts = []

    name = item.get("name", "")
    if name:
        parts.append(f"Repository: {name}")

    author = item.get("author", "")
    if isinstance(author, dict):
        author = author.get("name", "")
    elif isinstance(author, list):
        names = [
            entry.get("name", "") if isinstance(entry, dict) else entry
            for entry in author
        ]
        # Drop nameless entries and coerce to str so join cannot fail
        # on non-string names.
        author = ", ".join(str(n) for n in names if n)
    if author:
        parts.append(f"Author: {author}")

    # Only plain strings are sliced; structured values are skipped
    # instead of raising.
    desc = item.get("description", "")
    if isinstance(desc, str) and desc:
        parts.append(f"Description: {desc[:500]}")

    text = item.get("text", "")
    if isinstance(text, str) and len(text) > 50:
        parts.append(f"Content: {text[:2000]}")

    return parts

234 

235 

def _format_opengraph(item: dict) -> list:
    """Format OpenGraph metadata into readable text.

    Args:
        item: One OpenGraph block as a flat dict of ``og:*`` /
            ``product:*`` properties, optionally with ``@type``.

    Returns:
        List of lines for title (prefixed by site name, or by a
        non-generic OG type), description (at most 500 characters)
        and price; missing or empty fields are omitted.
    """
    parts = []
    og_type = item.get("@type", item.get("og:type", ""))
    title = item.get("og:title", "")
    desc = item.get("og:description", "")
    site = item.get("og:site_name", "")

    # Filter out generic OG types that add no information as prefixes
    generic_og_types = {"website", "article", ""}
    if title:
        if site:
            prefix = site
        elif isinstance(og_type, str) and og_type.lower() not in generic_og_types:
            prefix = og_type
        else:
            # Generic, missing, or non-string type: no prefix.
            prefix = ""
        parts.append(f"{prefix}: {title}" if prefix else title)

    if isinstance(desc, str) and len(desc) > 20:
        parts.append(f"Description: {desc[:500]}")

    # Price tags (some e-commerce sites put these in OG)
    price = item.get("product:price:amount", item.get("og:price:amount", ""))
    currency = item.get(
        "product:price:currency", item.get("og:price:currency", "")
    )
    if price:
        # Same formatting as Product prices: no stray space when the
        # currency is missing.
        parts.append(
            f"Price: {currency} {price}" if currency else f"Price: {price}"
        )

    return parts

265 

266 

def _format_generic(item: dict) -> list:
    """Format a generic CreativeWork/Dataset schema.

    Args:
        item: A CreativeWork or Dataset item as a dict.

    Returns:
        List of lines for name and description (at most 500
        characters); missing or empty fields are omitted.
    """
    parts = []

    # Fall back to `headline` when `name` is absent *or* empty.
    name = item.get("name") or item.get("headline", "")
    if name:
        parts.append(f"Name: {name}")

    # Only plain strings are measured and sliced; structured values
    # are skipped instead of raising.
    desc = item.get("description", "")
    if isinstance(desc, str) and len(desc) > 20:
        parts.append(f"Description: {desc[:500]}")

    return parts