Coverage for src/local_deep_research/research_library/downloaders/extraction/metadata_extractor.py

1"""

2Structured metadata extraction using extruct.

4Extracts JSON-LD, OpenGraph, and microdata from HTML pages.

5Used to enrich text extraction with structured data — especially

6useful for product pages, articles, and other schema.org-annotated

7content where text extraction alone misses key information.

8"""

10from typing import Any, Dict, Optional

12from loguru import logger

def extract_metadata(html: str, url: str = "") -> Dict[str, Any]:
    """Extract structured metadata from HTML.

    Pulls JSON-LD, OpenGraph, and microdata in one pass.

    Args:
        html: Raw HTML string.
        url: Base URL for resolving relative URLs in metadata.

    Returns:
        Dict with keys: json_ld, opengraph, microdata (each a list of dicts).
        The lists are empty if the input is blank, nothing is found,
        extruct is not installed, or extraction fails.
    """
    result: Dict[str, Any] = {
        "json_ld": [],
        "opengraph": [],
        "microdata": [],
    }

    # Blank input: nothing to parse, so skip the import and the parser.
    if not html or not html.strip():
        return result

    # extruct is imported lazily so a missing install degrades to an
    # empty result instead of breaking the module import.
    try:
        import extruct
    except ImportError:
        logger.debug("extruct not installed — skipping metadata extraction")
        return result

    try:
        data = extruct.extract(
            html,
            base_url=url,
            syntaxes=["json-ld", "opengraph", "microdata"],
            uniform=True,
        )
        # extruct keys JSON-LD as "json-ld"; ``or []`` keeps the
        # list-valued contract even if a syntax maps to None.
        result["json_ld"] = data.get("json-ld") or []
        result["opengraph"] = data.get("opengraph") or []
        result["microdata"] = data.get("microdata") or []
    except Exception:
        # Best-effort: malformed markup must never fail the caller.
        # loguru does not honour the stdlib ``exc_info`` keyword, so the
        # traceback is attached through ``opt(exception=True)`` instead.
        logger.opt(exception=True).debug("extruct metadata extraction failed")

    return result

def metadata_to_text(metadata: Dict[str, Any]) -> Optional[str]:
    """Convert structured metadata into readable text supplement.

    Extracts the most useful fields from JSON-LD, OpenGraph, and
    microdata and formats them as a text block that can be appended
    to extracted content.

    Args:
        metadata: Output from extract_metadata(). Missing keys are
            treated as empty lists; entries that are not dicts are
            ignored.

    Returns:
        Formatted text string (one field per line), or None if no useful
        metadata found.
    """
    parts: list = []

    # Entries that are not dicts carry no "@type" to dispatch on, so they
    # are dropped once here; the same filtered list is reused below for
    # the microdata de-duplication check.
    json_ld = [
        entry
        for entry in (metadata.get("json_ld") or [])
        if isinstance(entry, dict)
    ]

    # JSON-LD — richest source (Schema.org types)
    for item in json_ld:
        item_type = item.get("@type", "")
        if isinstance(item_type, list):
            # Multi-typed node: dispatch on the first declared type.
            item_type = item_type[0] if item_type else ""

        if item_type == "Product":
            parts.extend(_format_product(item))
        elif item_type in (
            "Article",
            "NewsArticle",
            "BlogPosting",
            "ScholarlyArticle",
        ):
            parts.extend(_format_article(item))
        elif item_type == "SoftwareSourceCode":
            parts.extend(_format_software(item))
        elif item_type in ("Dataset", "CreativeWork"):
            parts.extend(_format_generic(item))

    # Microdata — check for types not covered by JSON-LD
    for item in metadata.get("microdata") or []:
        if not isinstance(item, dict):
            continue
        item_type = item.get("@type", "")
        if isinstance(item_type, list):
            item_type = item_type[0] if item_type else ""

        # Use the local ``json_ld`` list rather than metadata["json_ld"]
        # so a metadata dict without that key does not raise KeyError.
        if item_type == "Product" and not _has_type(json_ld, "Product"):
            parts.extend(_format_product(item))
        elif item_type == "SoftwareSourceCode" and not _has_type(
            json_ld, "SoftwareSourceCode"
        ):
            parts.extend(_format_software(item))

    # OpenGraph — fallback when JSON-LD/microdata don't have structured types
    if not parts:
        for item in metadata.get("opengraph") or []:
            if not isinstance(item, dict):
                continue
            og_parts = _format_opengraph(item)
            if og_parts:
                parts.extend(og_parts)
                break  # Only use the first OG block that yields text

    if not parts:
        return None

    return "\n".join(parts)

121

122

123def _has_type(items: list, type_name: str) -> bool:

124 """Check if any item in the list has the given @type."""

125 for item in items:

126 t = item.get("@type", "")

127 if isinstance(t, list):

128 if type_name in t:

129 return True

130 elif t == type_name:

131 return True

132 return False

133

134

135def _format_product(item: dict) -> list:

136 """Format Product schema into readable text."""

137 parts = []

138 name = item.get("name", "")

139 if name:

140 parts.append(f"Product: {name}")

141

142 desc = item.get("description", "")

143 if desc and len(desc) > 20:

144 parts.append(f"Description: {desc[:500]}")

145

146 brand = item.get("brand", "")

147 if isinstance(brand, dict):

148 brand = brand.get("name", "")

149 if brand:

150 parts.append(f"Brand: {brand}")

151

152 # Price

153 offers = item.get("offers", {})

154 if isinstance(offers, list):

155 offers = offers[0] if offers else {}

156 if isinstance(offers, dict):

157 price = offers.get("price", "")

158 currency = offers.get("priceCurrency", "")

159 if price:

160 parts.append(

161 f"Price: {currency} {price}" if currency else f"Price: {price}"

162 )

163 availability = offers.get("availability", "")

164 if availability:

165 parts.append(f"Availability: {availability}")

166

167 # Rating

168 rating = item.get("aggregateRating", {})

169 if isinstance(rating, dict): 169 ↛ 175line 169 didn't jump to line 175 because the condition on line 169 was always true

170 value = rating.get("ratingValue", "")

171 count = rating.get("reviewCount", rating.get("ratingCount", ""))

172 if value:

173 parts.append(f"Rating: {value}/5 ({count} reviews)")

174

175 return parts

176

177

178def _format_article(item: dict) -> list:

179 """Format Article schema into readable text."""

180 parts = []

181 name = item.get("headline", item.get("name", ""))

182 if name: 182 ↛ 185line 182 didn't jump to line 185 because the condition on line 182 was always true

183 parts.append(f"Article: {name}")

184

185 author = item.get("author", "")

186 if isinstance(author, dict):

187 author = author.get("name", "")

188 elif isinstance(author, list):

189 author = ", ".join(

190 a.get("name", str(a)) if isinstance(a, dict) else str(a)

191 for a in author

192 )

193 if author:

194 parts.append(f"Author: {author}")

195

196 date = item.get("datePublished", "")

197 if date:

198 parts.append(f"Published: {date}")

199

200 body = item.get("articleBody", "")

201 if body and len(body) > 50:

202 parts.append(f"Content: {body[:2000]}")

203

204 return parts

205

206

207def _format_software(item: dict) -> list:

208 """Format SoftwareSourceCode schema into readable text."""

209 parts = []

210 name = item.get("name", "")

211 if name: 211 ↛ 214line 211 didn't jump to line 214 because the condition on line 211 was always true

212 parts.append(f"Repository: {name}")

213

214 author = item.get("author", "")

215 if isinstance(author, dict):

216 author = author.get("name", "")

217 elif isinstance(author, list):

218 author = ", ".join(

219 a.get("name", str(a)) if isinstance(a, dict) else str(a)

220 for a in author

221 )

222 if author:

223 parts.append(f"Author: {author}")

224

225 desc = item.get("description", "")

226 if desc:

227 parts.append(f"Description: {desc[:500]}")

228

229 text = item.get("text", "")

230 if text and len(text) > 50:

231 parts.append(f"Content: {text[:2000]}")

232

233 return parts

234

235

236def _format_opengraph(item: dict) -> list:

237 """Format OpenGraph metadata into readable text."""

238 parts = []

239 og_type = item.get("@type", item.get("og:type", ""))

240 title = item.get("og:title", "")

241 desc = item.get("og:description", "")

242 site = item.get("og:site_name", "")

243

244 # Filter out generic OG types that add no information as prefixes

245 generic_og_types = {"website", "article", ""}

246 if title:

247 prefix = (

248 site

249 if site

250 else (og_type if og_type.lower() not in generic_og_types else "")

251 )

252 parts.append(f"{prefix}: {title}" if prefix else title)

253 if desc and len(desc) > 20:

254 parts.append(f"Description: {desc[:500]}")

255

256 # Price tags (some e-commerce sites put these in OG)

257 price = item.get("product:price:amount", item.get("og:price:amount", ""))

258 currency = item.get(

259 "product:price:currency", item.get("og:price:currency", "")

260 )

261 if price:

262 parts.append(f"Price: {currency} {price}")

263

264 return parts

265

266

267def _format_generic(item: dict) -> list:

268 """Format a generic CreativeWork/Dataset schema."""

269 parts = []

270 name = item.get("name", item.get("headline", ""))

271 if name:

272 parts.append(f"Name: {name}")

273

274 desc = item.get("description", "")

275 if desc and len(desc) > 20:

276 parts.append(f"Description: {desc[:500]}")

277

278 return parts

Coverage for src/local_deep_research/research_library/downloaders/extraction/metadata_extractor.py: 99%

154 statements