Coverage for src / local_deep_research / research_library / downloaders / extraction / metadata_extractor.py: 68%
154 statements
« prev ^ index » next coverage.py v7.13.5, created at 2026-04-14 23:55 +0000
« prev ^ index » next coverage.py v7.13.5, created at 2026-04-14 23:55 +0000
1"""
2Structured metadata extraction using extruct.
4Extracts JSON-LD, OpenGraph, and microdata from HTML pages.
5Used to enrich text extraction with structured data — especially
6useful for product pages, articles, and other schema.org-annotated
7content where text extraction alone misses key information.
8"""
10from typing import Any, Dict, Optional
12from loguru import logger
def extract_metadata(html: str, url: str = "") -> Dict[str, Any]:
    """Extract structured metadata from HTML.

    Pulls JSON-LD, OpenGraph, and microdata in one pass using extruct.
    Extraction is best-effort: any failure is logged at debug level and
    the empty result is returned instead of raising.

    Args:
        html: Raw HTML string.
        url: Base URL for resolving relative URLs in metadata.

    Returns:
        Dict with keys: json_ld, opengraph, microdata (each a list of dicts).
        Empty lists if nothing found, extruct is not installed, or
        extraction fails.
    """
    result: Dict[str, Any] = {
        "json_ld": [],
        "opengraph": [],
        "microdata": [],
    }

    if not html or not html.strip():
        return result

    # extruct is an optional dependency, so it is imported lazily here.
    try:
        import extruct
    except ImportError:
        logger.debug("extruct not installed — skipping metadata extraction")
        return result

    try:
        data = extruct.extract(
            html,
            base_url=url,
            syntaxes=["json-ld", "opengraph", "microdata"],
            uniform=True,
        )
        # ``or []`` keeps the documented "list" contract even if a syntax
        # key is present but maps to None.
        result["json_ld"] = data.get("json-ld") or []
        result["opengraph"] = data.get("opengraph") or []
        result["microdata"] = data.get("microdata") or []
    except Exception:
        # loguru does not interpret the stdlib-style ``exc_info`` keyword;
        # ``opt(exception=True)`` is what attaches the traceback.
        logger.opt(exception=True).debug("extruct metadata extraction failed")

    return result
def metadata_to_text(metadata: Dict[str, Any]) -> Optional[str]:
    """Convert structured metadata into readable text supplement.

    Extracts the most useful fields from JSON-LD, OpenGraph, and
    microdata and formats them as a text block that can be appended
    to extracted content.

    Sources are consulted in this order:

    1. JSON-LD items of a recognised type (Product, article types,
       SoftwareSourceCode, Dataset/CreativeWork).
    2. Microdata Product / SoftwareSourceCode items, but only when JSON-LD
       did not already declare that type.
    3. The first OpenGraph block that yields text, used only as a fallback
       when the two sources above produced nothing.

    Missing keys, ``None`` values and non-dict entries are skipped rather
    than raising. When ``@type`` is a list, the first recognised entry
    decides how the item is formatted.

    Args:
        metadata: Output from extract_metadata().

    Returns:
        Formatted text string, or None if no useful metadata found.
    """
    if not metadata:
        return None

    def _dict_items(key: str) -> list:
        # Tolerate a missing key, a None value, and stray non-dict entries.
        return [
            entry
            for entry in (metadata.get(key) or [])
            if isinstance(entry, dict)
        ]

    def _declared_types(entry: dict) -> list:
        # ``@type`` may be a single value or a list of values.
        declared = entry.get("@type", "")
        return list(declared) if isinstance(declared, (list, tuple)) else [declared]

    json_ld = _dict_items("json_ld")
    article_types = ("Article", "NewsArticle", "BlogPosting", "ScholarlyArticle")
    parts = []

    # JSON-LD — richest source (Schema.org types)
    for item in json_ld:
        for item_type in _declared_types(item):
            if item_type == "Product":
                parts.extend(_format_product(item))
            elif item_type in article_types:
                parts.extend(_format_article(item))
            elif item_type == "SoftwareSourceCode":
                parts.extend(_format_software(item))
            elif item_type in ("Dataset", "CreativeWork"):
                parts.extend(_format_generic(item))
            else:
                continue  # unrecognised type: try the item's next @type
            break  # format each item once, by its first recognised type

    # Microdata — check for types not covered by JSON-LD
    for item in _dict_items("microdata"):
        for item_type in _declared_types(item):
            if item_type == "Product":
                if not _has_type(json_ld, "Product"):
                    parts.extend(_format_product(item))
            elif item_type == "SoftwareSourceCode":
                if not _has_type(json_ld, "SoftwareSourceCode"):
                    parts.extend(_format_software(item))
            else:
                continue
            break

    # OpenGraph — fallback when JSON-LD/microdata don't have structured types
    if not parts:
        for item in _dict_items("opengraph"):
            og_parts = _format_opengraph(item)
            if og_parts:
                parts.extend(og_parts)
                break  # Only use the first OG block

    if not parts:
        return None

    return "\n".join(parts)
123def _has_type(items: list, type_name: str) -> bool:
124 """Check if any item in the list has the given @type."""
125 for item in items: 125 ↛ 126line 125 didn't jump to line 126 because the loop on line 125 never started
126 t = item.get("@type", "")
127 if isinstance(t, list):
128 if type_name in t:
129 return True
130 elif t == type_name:
131 return True
132 return False
135def _format_product(item: dict) -> list:
136 """Format Product schema into readable text."""
137 parts = []
138 name = item.get("name", "")
139 if name: 139 ↛ 142line 139 didn't jump to line 142 because the condition on line 139 was always true
140 parts.append(f"Product: {name}")
142 desc = item.get("description", "")
143 if desc and len(desc) > 20: 143 ↛ 146line 143 didn't jump to line 146 because the condition on line 143 was always true
144 parts.append(f"Description: {desc[:500]}")
146 brand = item.get("brand", "")
147 if isinstance(brand, dict): 147 ↛ 149line 147 didn't jump to line 149 because the condition on line 147 was always true
148 brand = brand.get("name", "")
149 if brand: 149 ↛ 153line 149 didn't jump to line 153 because the condition on line 149 was always true
150 parts.append(f"Brand: {brand}")
152 # Price
153 offers = item.get("offers", {})
154 if isinstance(offers, list): 154 ↛ 155line 154 didn't jump to line 155 because the condition on line 154 was never true
155 offers = offers[0] if offers else {}
156 if isinstance(offers, dict): 156 ↛ 168line 156 didn't jump to line 168 because the condition on line 156 was always true
157 price = offers.get("price", "")
158 currency = offers.get("priceCurrency", "")
159 if price: 159 ↛ 163line 159 didn't jump to line 163 because the condition on line 159 was always true
160 parts.append(
161 f"Price: {currency} {price}" if currency else f"Price: {price}"
162 )
163 availability = offers.get("availability", "")
164 if availability: 164 ↛ 165line 164 didn't jump to line 165 because the condition on line 164 was never true
165 parts.append(f"Availability: {availability}")
167 # Rating
168 rating = item.get("aggregateRating", {})
169 if isinstance(rating, dict): 169 ↛ 175line 169 didn't jump to line 175 because the condition on line 169 was always true
170 value = rating.get("ratingValue", "")
171 count = rating.get("reviewCount", rating.get("ratingCount", ""))
172 if value: 172 ↛ 175line 172 didn't jump to line 175 because the condition on line 172 was always true
173 parts.append(f"Rating: {value}/5 ({count} reviews)")
175 return parts
178def _format_article(item: dict) -> list:
179 """Format Article schema into readable text."""
180 parts = []
181 name = item.get("headline", item.get("name", ""))
182 if name: 182 ↛ 185line 182 didn't jump to line 185 because the condition on line 182 was always true
183 parts.append(f"Article: {name}")
185 author = item.get("author", "")
186 if isinstance(author, dict): 186 ↛ 188line 186 didn't jump to line 188 because the condition on line 186 was always true
187 author = author.get("name", "")
188 elif isinstance(author, list):
189 author = ", ".join(
190 a.get("name", str(a)) if isinstance(a, dict) else str(a)
191 for a in author
192 )
193 if author: 193 ↛ 196line 193 didn't jump to line 196 because the condition on line 193 was always true
194 parts.append(f"Author: {author}")
196 date = item.get("datePublished", "")
197 if date: 197 ↛ 200line 197 didn't jump to line 200 because the condition on line 197 was always true
198 parts.append(f"Published: {date}")
200 body = item.get("articleBody", "")
201 if body and len(body) > 50: 201 ↛ 202line 201 didn't jump to line 202 because the condition on line 201 was never true
202 parts.append(f"Content: {body[:2000]}")
204 return parts
207def _format_software(item: dict) -> list:
208 """Format SoftwareSourceCode schema into readable text."""
209 parts = []
210 name = item.get("name", "")
211 if name: 211 ↛ 214line 211 didn't jump to line 214 because the condition on line 211 was always true
212 parts.append(f"Repository: {name}")
214 author = item.get("author", "")
215 if isinstance(author, dict): 215 ↛ 216line 215 didn't jump to line 216 because the condition on line 215 was never true
216 author = author.get("name", "")
217 elif isinstance(author, list): 217 ↛ 218line 217 didn't jump to line 218 because the condition on line 217 was never true
218 author = ", ".join(
219 a.get("name", str(a)) if isinstance(a, dict) else str(a)
220 for a in author
221 )
222 if author: 222 ↛ 225line 222 didn't jump to line 225 because the condition on line 222 was always true
223 parts.append(f"Author: {author}")
225 desc = item.get("description", "")
226 if desc: 226 ↛ 227line 226 didn't jump to line 227 because the condition on line 226 was never true
227 parts.append(f"Description: {desc[:500]}")
229 text = item.get("text", "")
230 if text and len(text) > 50: 230 ↛ 233line 230 didn't jump to line 233 because the condition on line 230 was always true
231 parts.append(f"Content: {text[:2000]}")
233 return parts
236def _format_opengraph(item: dict) -> list:
237 """Format OpenGraph metadata into readable text."""
238 parts = []
239 og_type = item.get("@type", item.get("og:type", ""))
240 title = item.get("og:title", "")
241 desc = item.get("og:description", "")
242 site = item.get("og:site_name", "")
244 # Filter out generic OG types that add no information as prefixes
245 generic_og_types = {"website", "article", ""}
246 if title:
247 prefix = (
248 site
249 if site
250 else (og_type if og_type.lower() not in generic_og_types else "")
251 )
252 parts.append(f"{prefix}: {title}" if prefix else title)
253 if desc and len(desc) > 20:
254 parts.append(f"Description: {desc[:500]}")
256 # Price tags (some e-commerce sites put these in OG)
257 price = item.get("product:price:amount", item.get("og:price:amount", ""))
258 currency = item.get(
259 "product:price:currency", item.get("og:price:currency", "")
260 )
261 if price: 261 ↛ 262line 261 didn't jump to line 262 because the condition on line 261 was never true
262 parts.append(f"Price: {currency} {price}")
264 return parts
267def _format_generic(item: dict) -> list:
268 """Format a generic CreativeWork/Dataset schema."""
269 parts = []
270 name = item.get("name", item.get("headline", ""))
271 if name:
272 parts.append(f"Name: {name}")
274 desc = item.get("description", "")
275 if desc and len(desc) > 20:
276 parts.append(f"Description: {desc[:500]}")
278 return parts