Coverage for src / local_deep_research / web_search_engines / engines / search_engine_guardian.py: 95%

241 statements  

« prev     ^ index     » next       coverage.py v7.13.5, created at 2026-04-14 23:55 +0000

1from datetime import datetime, timedelta, UTC 

2from typing import Any, Dict, List, Optional, Tuple 

3 

4from langchain_core.language_models import BaseLLM 

5from loguru import logger 

6 

7from ...utilities.search_utilities import remove_think_tags 

8from ...security.safe_requests import safe_get 

9from ..rate_limiting import RateLimitError 

10from ..search_engine_base import BaseSearchEngine 

11 

12 

class GuardianSearchEngine(BaseSearchEngine):
    """Enhanced Guardian API search engine implementation with LLM query optimization"""

    # Mark as public search engine
    is_public = True

    def __init__(
        self,
        max_results: int = 10,
        api_key: Optional[str] = None,
        from_date: Optional[str] = None,
        to_date: Optional[str] = None,
        section: Optional[str] = None,
        order_by: str = "relevance",
        llm: Optional[BaseLLM] = None,
        max_filtered_results: Optional[int] = None,
        optimize_queries: bool = True,
        adaptive_search: bool = True,
        settings_snapshot: Optional[Dict[str, Any]] = None,
        **kwargs,
    ):
        """
        Initialize The Guardian search engine with enhanced features.

        Args:
            max_results: Maximum number of search results
            api_key: The Guardian API key (can also be set via LDR_SEARCH_ENGINE_WEB_GUARDIAN_API_KEY env var or in UI settings)
            from_date: Start date for search (YYYY-MM-DD format, default 1 month ago)
            to_date: End date for search (YYYY-MM-DD format, default today)
            section: Filter by section (e.g., "politics", "technology", "sport")
            order_by: Sort order ("relevance", "newest", "oldest")
            llm: Language model for relevance filtering and query optimization
            max_filtered_results: Maximum number of results to keep after filtering
            optimize_queries: Whether to optimize queries using LLM
            adaptive_search: Whether to use adaptive search (adjusting date ranges)
        """
        # Let the base class wire up the LLM, filtering and shared settings.
        super().__init__(
            llm=llm,
            max_filtered_results=max_filtered_results,
            max_results=max_results,
            settings_snapshot=settings_snapshot,
        )

        # API key may come from the explicit argument, the settings snapshot,
        # or an environment variable; resolution order is handled by the base.
        self.api_key = self._resolve_api_key(
            api_key,
            "search.engine.web.guardian.api_key",
            engine_name="Guardian",
            settings_snapshot=settings_snapshot,
        )
        self.optimize_queries = optimize_queries
        self.adaptive_search = adaptive_search

        # Default date window: the last 30 days up to today (UTC).
        now = datetime.now(UTC)
        self.from_date = from_date or (now - timedelta(days=30)).strftime(
            "%Y-%m-%d"
        )
        self.to_date = to_date or now.strftime("%Y-%m-%d")

        self.section = section
        self.order_by = order_by
        # Remember the caller-visible window so searches that widen it
        # can restore the original values afterwards.
        self._original_date_params = {
            "from_date": self.from_date,
            "to_date": self.to_date,
        }

        # API base URL
        self.api_url = "https://content.guardianapis.com/search"

90 

91 def _optimize_query_for_guardian(self, query: str) -> str: 

92 """ 

93 Optimize a natural language query for Guardian search. 

94 Uses LLM to transform questions into effective news search queries. 

95 

96 Args: 

97 query: Natural language query 

98 

99 Returns: 

100 Optimized query string for Guardian 

101 """ 

102 # Handle extremely long queries by truncating first 

103 if len(query) > 150: 

104 simple_query = " ".join(query.split()[:10]) 

105 logger.info( 

106 f"Query too long ({len(query)} chars), truncating to: {simple_query}" 

107 ) 

108 query = simple_query 

109 

110 if not self.llm or not self.optimize_queries: 

111 # Return original query if no LLM available or optimization disabled 

112 return query 

113 

114 try: 

115 # Prompt for query optimization 

116 prompt = f"""Transform this natural language question into a very short Guardian news search query. 

117 

118Original query: "{query}" 

119 

120CRITICAL RULES: 

1211. ONLY RETURN THE EXACT SEARCH QUERY - NO EXPLANATIONS, NO COMMENTS 

1222. Keep it EXTREMELY BRIEF - MAXIMUM 3-4 words total 

1233. Focus only on the main topic/person/event 

1244. Include proper names when relevant 

1255. Remove ALL unnecessary words 

1266. DO NOT use Boolean operators (no AND/OR) 

1277. DO NOT use quotes 

128 

129EXAMPLE CONVERSIONS: 

130✓ "What's the impact of rising interest rates on UK housing market?" → "UK housing rates" 

131✓ "Latest developments in the Ukraine-Russia peace negotiations" → "Ukraine Russia negotiations" 

132✓ "How are tech companies responding to AI regulation?" → "tech AI regulation" 

133✓ "What is Donald Trump's current political activity?" → "Trump political activity" 

134 

135Return ONLY the extremely brief search query. 

136""" 

137 

138 # Get response from LLM 

139 response = self.llm.invoke(prompt) 

140 optimized_query = remove_think_tags( 

141 str(response.content) 

142 if hasattr(response, "content") 

143 else str(response) 

144 ).strip() 

145 

146 # Clean up the query - remove any explanations 

147 lines = optimized_query.split("\n") 

148 for line in lines: 

149 line = line.strip() 

150 if line and not line.lower().startswith( 

151 ("here", "i would", "the best", "this query") 

152 ): 

153 optimized_query = line 

154 break 

155 

156 # Remove any quotes that wrap the entire query 

157 if ( 

158 optimized_query.startswith('"') 

159 and optimized_query.endswith('"') 

160 and optimized_query.count('"') == 2 

161 ): 

162 optimized_query = optimized_query[1:-1] 

163 

164 logger.info(f"Original query: '{query}'") 

165 logger.info(f"Optimized for Guardian: '{optimized_query}'") 

166 

167 return optimized_query 

168 

169 except Exception: 

170 logger.exception("Error optimizing query") 

171 return query # Fall back to original query on error 

172 

173 def _adapt_dates_for_query_type(self, query: str) -> None: 

174 """ 

175 Adapt date range based on query type (historical vs current). 

176 

177 Args: 

178 query: The search query 

179 """ 

180 # Fast path - for very short queries, default to recent news 

181 if len(query.split()) <= 4: 

182 logger.info("Short query detected, defaulting to recent news") 

183 # Default to 60 days for short queries 

184 recent = (datetime.now(UTC) - timedelta(days=60)).strftime( 

185 "%Y-%m-%d" 

186 ) 

187 self.from_date = recent 

188 self.order_by = "newest" 

189 return 

190 

191 if not self.llm or not self.adaptive_search: 

192 return 

193 

194 try: 

195 prompt = f"""Is this query asking about HISTORICAL events or CURRENT events? 

196 

197Query: "{query}" 

198 

199ONE WORD ANSWER ONLY: 

200- "HISTORICAL" if about past events (older than 1 year) 

201- "CURRENT" if about recent events (within past year) 

202- "UNCLEAR" if can't determine 

203 

204ONE WORD ONLY:""" 

205 

206 response = self.llm.invoke(prompt) 

207 answer = ( 

208 remove_think_tags( 

209 str(response.content) 

210 if hasattr(response, "content") 

211 else str(response) 

212 ) 

213 .strip() 

214 .upper() 

215 ) 

216 

217 # Reset to original parameters first 

218 self.from_date = self._original_date_params["from_date"] 

219 self.to_date = self._original_date_params["to_date"] 

220 

221 if "HISTORICAL" in answer: 

222 # For historical queries, go back 10 years 

223 logger.info( 

224 "Query classified as HISTORICAL - extending search timeframe" 

225 ) 

226 ten_years_ago = ( 

227 datetime.now(UTC) - timedelta(days=3650) 

228 ).strftime("%Y-%m-%d") 

229 self.from_date = ten_years_ago 

230 

231 elif "CURRENT" in answer: 

232 # For current events, focus on recent content 

233 logger.info( 

234 "Query classified as CURRENT - focusing on recent content" 

235 ) 

236 recent = (datetime.now(UTC) - timedelta(days=60)).strftime( 

237 "%Y-%m-%d" 

238 ) 

239 self.from_date = recent 

240 self.order_by = "newest" # Prioritize newest for current events 

241 

242 except Exception: 

243 logger.exception("Error adapting dates for query type") 

244 # Keep original date parameters on error 

245 

246 def _adaptive_search(self, query: str) -> Tuple[List[Dict[str, Any]], str]: 

247 """ 

248 Perform adaptive search that progressively adjusts parameters based on results. 

249 

250 Args: 

251 query: The search query 

252 

253 Returns: 

254 Tuple of (list of articles, search strategy used) 

255 """ 

256 # Try with current parameters 

257 articles = self._get_all_data(query) 

258 strategy = "initial" 

259 

260 # If no results or too few, try different strategies 

261 if len(articles) < 3 and self.adaptive_search: 

262 logger.info( 

263 f"Initial search found only {len(articles)} results, trying alternative strategies" 

264 ) 

265 

266 # Try with expanded date range 

267 original_from_date = self.from_date 

268 original_order_by = self.order_by 

269 

270 # Strategy 1: Expand to 6 months 

271 logger.info("Strategy 1: Expanding time range to 6 months") 

272 six_months_ago = (datetime.now(UTC) - timedelta(days=180)).strftime( 

273 "%Y-%m-%d" 

274 ) 

275 self.from_date = six_months_ago 

276 

277 articles1 = self._get_all_data(query) 

278 if len(articles1) > len(articles): 

279 articles = articles1 

280 strategy = "expanded_6mo" 

281 

282 # Strategy 2: Expand to all time and try relevance order 

283 if len(articles) < 3: 

284 logger.info( 

285 "Strategy 2: Expanding to all time with relevance ordering" 

286 ) 

287 self.from_date = "2000-01-01" # Effectively "all time" 

288 self.order_by = "relevance" 

289 

290 articles2 = self._get_all_data(query) 

291 if len(articles2) > len(articles): 

292 articles = articles2 

293 strategy = "all_time_relevance" 

294 

295 # Strategy 3: Try removing section constraints 

296 if len(articles) < 3 and self.section: 

297 logger.info("Strategy 3: Removing section constraint") 

298 original_section = self.section 

299 self.section = None 

300 

301 articles3 = self._get_all_data(query) 

302 if len(articles3) > len(articles): 302 ↛ 307line 302 didn't jump to line 307 because the condition on line 302 was always true

303 articles = articles3 

304 strategy = "no_section" 

305 

306 # Restore section setting 

307 self.section = original_section 

308 

309 # Restore original settings 

310 self.from_date = original_from_date 

311 self.order_by = original_order_by 

312 

313 logger.info( 

314 f"Adaptive search using strategy '{strategy}' found {len(articles)} results" 

315 ) 

316 return articles, strategy 

317 

318 def _get_all_data(self, query: str) -> List[Dict[str, Any]]: 

319 """ 

320 Get all article data from The Guardian API in a single call. 

321 Always requests all fields for simplicity. 

322 

323 Args: 

324 query: The search query 

325 

326 Returns: 

327 List of articles with all data 

328 """ 

329 try: 

330 # Ensure query is not empty 

331 if not query or query.strip() == "": 

332 query = "news" 

333 logger.warning("Empty query provided, using 'news' as default") 

334 

335 # Ensure query is not too long for API 

336 if len(query) > 100: 

337 logger.warning( 

338 f"Query too long for Guardian API ({len(query)} chars), truncating" 

339 ) 

340 query = query[:100] 

341 

342 # Always request all fields for simplicity 

343 # Ensure max_results is an integer to avoid comparison errors 

344 page_size = min( 

345 int(self.max_results) if self.max_results is not None else 10, 

346 50, 

347 ) 

348 

349 # Log full parameters for debugging 

350 logger.info(f"Guardian API search query: '{query}'") 

351 logger.info( 

352 f"Guardian API date range: {self.from_date} to {self.to_date}" 

353 ) 

354 

355 params = { 

356 "q": query, 

357 "api-key": self.api_key, 

358 "from-date": self.from_date, 

359 "to-date": self.to_date, 

360 "order-by": self.order_by, 

361 "page-size": page_size, # API maximum is 50 

362 "show-fields": "headline,trailText,byline,body,publication", 

363 "show-tags": "keyword", 

364 } 

365 

366 # Add section filter if specified 

367 if self.section: 

368 params["section"] = self.section 

369 

370 # Log the complete request parameters (except API key) 

371 log_params = params.copy() 

372 log_params["api-key"] = "REDACTED" 

373 logger.info(f"Guardian API request parameters: {log_params}") 

374 

375 # Apply rate limiting before request 

376 self._last_wait_time = self.rate_tracker.apply_rate_limit( 

377 self.engine_type 

378 ) 

379 

380 # Execute the API request 

381 response = safe_get(self.api_url, params=params) 

382 response.raise_for_status() 

383 

384 data = response.json() 

385 

386 # Extract results from the response 

387 articles = data.get("response", {}).get("results", []) 

388 logger.info(f"Guardian API returned {len(articles)} articles") 

389 

390 # Format results to include all data 

391 formatted_articles = [] 

392 for i, article in enumerate(articles): 

393 if i >= self.max_results: 

394 break 

395 

396 fields = article.get("fields", {}) 

397 

398 # Format the article with all fields 

399 result = { 

400 "id": article.get("id", ""), 

401 "title": fields.get( 

402 "headline", article.get("webTitle", "") 

403 ), 

404 "link": article.get("webUrl", ""), 

405 "snippet": fields.get("trailText", ""), 

406 "publication_date": article.get("webPublicationDate", ""), 

407 "section": article.get("sectionName", ""), 

408 "author": fields.get("byline", ""), 

409 "content": fields.get("body", ""), 

410 "full_content": fields.get("body", ""), 

411 } 

412 

413 # Extract tags/keywords 

414 tags = article.get("tags", []) 

415 result["keywords"] = [ 

416 tag.get("webTitle", "") 

417 for tag in tags 

418 if tag.get("type") == "keyword" 

419 ] 

420 

421 formatted_articles.append(result) 

422 

423 return formatted_articles 

424 

425 except RateLimitError: 

426 raise 

427 except Exception as e: 

428 sanitized = self._sanitize_error_message(str(e)) 

429 logger.exception( 

430 "Error getting data from The Guardian API: {}", sanitized 

431 ) 

432 self._raise_if_rate_limit(e) 

433 return [] 

434 

435 def _get_previews(self, query: str) -> List[Dict[str, Any]]: 

436 """ 

437 Get preview information for Guardian articles with enhanced optimization. 

438 

439 Args: 

440 query: The search query 

441 

442 Returns: 

443 List of preview dictionaries 

444 """ 

445 logger.info( 

446 f"Getting articles from The Guardian API for query: {query}" 

447 ) 

448 

449 # Step 1: Optimize the query using LLM 

450 optimized_query = self._optimize_query_for_guardian(query) 

451 

452 # Step 2: Adapt date parameters based on query type 

453 self._adapt_dates_for_query_type(optimized_query) 

454 

455 # Step 3: Perform adaptive search 

456 articles, strategy = self._adaptive_search(optimized_query) 

457 

458 # Store search metadata for debugging 

459 self._search_metadata = { 

460 "original_query": query, 

461 "optimized_query": optimized_query, 

462 "strategy": strategy, 

463 "from_date": self.from_date, 

464 "to_date": self.to_date, 

465 "section": self.section, 

466 "order_by": self.order_by, 

467 } 

468 

469 # Store full articles for later use 

470 self._full_articles = {a["id"]: a for a in articles} 

471 

472 # Return only preview fields for each article 

473 previews = [] 

474 for article in articles: 

475 preview = { 

476 "id": article["id"], 

477 "title": article["title"], 

478 "link": article["link"], 

479 "snippet": article["snippet"], 

480 "publication_date": article["publication_date"], 

481 "section": article["section"], 

482 "author": article["author"], 

483 "keywords": article.get("keywords", []), 

484 } 

485 previews.append(preview) 

486 

487 return previews 

488 

489 def _get_full_content( 

490 self, relevant_items: List[Dict[str, Any]] 

491 ) -> List[Dict[str, Any]]: 

492 """ 

493 Get full content for the relevant Guardian articles. 

494 Restores full content from the cached data. 

495 

496 Args: 

497 relevant_items: List of relevant preview dictionaries 

498 

499 Returns: 

500 List of result dictionaries with full content 

501 """ 

502 logger.info( 

503 f"Adding full content to {len(relevant_items)} relevant Guardian articles" 

504 ) 

505 

506 # Get full articles for relevant items 

507 results = [] 

508 for item in relevant_items: 

509 article_id = item.get("id", "") 

510 

511 # Get the full article from our cache 

512 if ( 

513 hasattr(self, "_full_articles") 

514 and article_id in self._full_articles 

515 ): 

516 results.append(self._full_articles[article_id]) 

517 else: 

518 # If not found (shouldn't happen), just use the preview 

519 results.append(item) 

520 

521 return results 

522 

523 def run( 

524 self, query: str, research_context: Dict[str, Any] | None = None 

525 ) -> List[Dict[str, Any]]: 

526 """ 

527 Execute a search using The Guardian API with the enhanced approach. 

528 

529 Args: 

530 query: The search query 

531 research_context: Context from previous research to use. 

532 

533 Returns: 

534 List of search results 

535 """ 

536 logger.info("---Execute a search using The Guardian (enhanced)---") 

537 

538 # Additional safety check for None query 

539 if query is None: 

540 logger.error("None query passed to Guardian search engine") 

541 query = "news" 

542 

543 try: 

544 # Get previews with our enhanced method 

545 previews = self._get_previews(query) 

546 

547 # If no results, try one more time with a simplified query 

548 if not previews: 

549 simple_query = " ".join( 

550 [w for w in query.split() if len(w) > 3][:3] 

551 ) 

552 logger.warning( 

553 f"No Guardian articles found, trying simplified query: {simple_query}" 

554 ) 

555 previews = self._get_previews(simple_query) 

556 

557 # If still no results, try with a very generic query as last resort 

558 if not previews and "trump" in query.lower(): 558 ↛ 559line 558 didn't jump to line 559 because the condition on line 558 was never true

559 logger.warning("Trying last resort query: 'Donald Trump'") 

560 previews = self._get_previews("Donald Trump") 

561 elif not previews: 561 ↛ 566line 561 didn't jump to line 566 because the condition on line 561 was always true

562 logger.warning("Trying last resort query: 'news'") 

563 previews = self._get_previews("news") 

564 

565 # If still no results after all attempts, return empty list 

566 if not previews: 

567 logger.warning( 

568 "No Guardian articles found after multiple attempts" 

569 ) 

570 return [] 

571 

572 # Filter for relevance if we have an LLM 

573 if ( 573 ↛ 578line 573 didn't jump to line 578 because the condition on line 573 was never true

574 self.llm 

575 and hasattr(self, "max_filtered_results") 

576 and self.max_filtered_results 

577 ): 

578 filtered_items = self._filter_for_relevance(previews, query) 

579 if not filtered_items: 

580 # Fall back to unfiltered results if everything was filtered out 

581 logger.warning( 

582 "All articles filtered out, using unfiltered results" 

583 ) 

584 filtered_items = previews[: self.max_filtered_results] 

585 else: 

586 filtered_items = previews 

587 

588 # Get full content for relevant items 

589 results = self._get_full_content(filtered_items) 

590 

591 # Add source information to make it clear these are from The Guardian 

592 for result in results: 

593 if "source" not in result: 

594 result["source"] = "The Guardian" 

595 

596 # Clean up the cache after use 

597 if hasattr(self, "_full_articles"): 

598 del self._full_articles 

599 

600 # Restore original date parameters 

601 self.from_date = self._original_date_params["from_date"] 

602 self.to_date = self._original_date_params["to_date"] 

603 

604 # Log search metadata if available 

605 if hasattr(self, "_search_metadata"): 

606 logger.info(f"Search metadata: {self._search_metadata}") 

607 del self._search_metadata 

608 

609 return results 

610 

611 except RateLimitError: 

612 raise 

613 except Exception: 

614 logger.exception("Error in Guardian search") 

615 

616 # Restore original date parameters on error 

617 self.from_date = self._original_date_params["from_date"] 

618 self.to_date = self._original_date_params["to_date"] 

619 

620 return [] 

621 

622 def search_by_section( 

623 self, section: str, max_results: Optional[int] = None 

624 ) -> List[Dict[str, Any]]: 

625 """ 

626 Search for articles in a specific section. 

627 

628 Args: 

629 section: The Guardian section name (e.g., "politics", "technology") 

630 max_results: Maximum number of results (defaults to self.max_results) 

631 

632 Returns: 

633 List of articles in the section 

634 """ 

635 original_section = self.section 

636 original_max_results = self.max_results 

637 

638 try: 

639 # Set section and max_results for this search 

640 self.section = section 

641 if max_results: 641 ↛ 642line 641 didn't jump to line 642 because the condition on line 641 was never true

642 self.max_results = max_results 

643 

644 # Use empty query to get all articles in the section 

645 return self.run("") 

646 

647 finally: 

648 # Restore original values 

649 self.section = original_section 

650 self.max_results = original_max_results 

651 

652 def get_recent_articles( 

653 self, days: int = 7, max_results: Optional[int] = None 

654 ) -> List[Dict[str, Any]]: 

655 """ 

656 Get recent articles from The Guardian. 

657 

658 Args: 

659 days: Number of days to look back 

660 max_results: Maximum number of results (defaults to self.max_results) 

661 

662 Returns: 

663 List of recent articles 

664 """ 

665 original_from_date = self.from_date 

666 original_order_by = self.order_by 

667 original_max_results = self.max_results 

668 

669 try: 

670 # Set parameters for this search 

671 self.from_date = ( 

672 datetime.now(UTC) - timedelta(days=days) 

673 ).strftime("%Y-%m-%d") 

674 self.order_by = "newest" 

675 if max_results: 675 ↛ 676line 675 didn't jump to line 676 because the condition on line 675 was never true

676 self.max_results = max_results 

677 

678 # Use empty query to get all recent articles 

679 return self.run("") 

680 

681 finally: 

682 # Restore original values 

683 self.from_date = original_from_date 

684 self.order_by = original_order_by 

685 self.max_results = original_max_results