Coverage for src/local_deep_research/web_search_engines/engines/search_engine_guardian.py: 44%

243 statements  

coverage.py v7.12.0, created at 2026-01-11 00:51 +0000

1 from datetime import datetime, timedelta, UTC

2 from typing import Any, Dict, List, Optional, Tuple

3

4 from langchain_core.language_models import BaseLLM

5 from loguru import logger

6

7 from ...config import search_config

8 from ...config.search_config import get_setting_from_snapshot

9 from ...utilities.search_utilities import remove_think_tags

10 from ...security.safe_requests import safe_get

11 from ..search_engine_base import BaseSearchEngine

12

13

14 class GuardianSearchEngine(BaseSearchEngine):

15 """Enhanced Guardian API search engine implementation with LLM query optimization""" 

16 

17 # Mark as public search engine 

18 is_public = True 

19 

20 def __init__( 

21 self, 

22 max_results: int = 10, 

23 api_key: Optional[str] = None, 

24 from_date: Optional[str] = None, 

25 to_date: Optional[str] = None, 

26 section: Optional[str] = None, 

27 order_by: str = "relevance", 

28 llm: Optional[BaseLLM] = None, 

29 max_filtered_results: Optional[int] = None, 

30 optimize_queries: bool = True, 

31 adaptive_search: bool = True, 

32 **kwargs, 

33 ): 

34 """ 

35 Initialize The Guardian search engine with enhanced features. 

36 

37 Args: 

38 max_results: Maximum number of search results 

39 api_key: The Guardian API key (can also be set in GUARDIAN_API_KEY env) 

40 from_date: Start date for search (YYYY-MM-DD format, default 1 month ago) 

41 to_date: End date for search (YYYY-MM-DD format, default today) 

42 section: Filter by section (e.g., "politics", "technology", "sport") 

43 order_by: Sort order ("relevance", "newest", "oldest") 

44 llm: Language model for relevance filtering and query optimization 

45 max_filtered_results: Maximum number of results to keep after filtering 

46 optimize_queries: Whether to optimize queries using LLM 

47 adaptive_search: Whether to use adaptive search (adjusting date ranges) 

48 """ 

49 # Initialize the BaseSearchEngine with LLM, max_filtered_results, and max_results 

50 super().__init__( 

51 llm=llm, 

52 max_filtered_results=max_filtered_results, 

53 max_results=max_results, 

54 ) 

55 # Get API key - check params or database 

56 guardian_api_key = api_key 

57 if not guardian_api_key: 

58 guardian_api_key = get_setting_from_snapshot( 

59 "search.engine.web.guardian.api_key", 

60 settings_snapshot=kwargs.get("settings_snapshot"), 

61 ) 

62 self.api_key = guardian_api_key 

63 self.optimize_queries = optimize_queries 

64 self.adaptive_search = adaptive_search 

65 

66 if not self.api_key: 

67 raise ValueError( 

68 "Guardian API key not found. Please provide api_key parameter or set it in the UI settings." 

69 ) 

70 

71 # Set date ranges if not provided 

72 if not from_date: 

73 # Default to one month ago 

74 one_month_ago = datetime.now(UTC) - timedelta(days=30) 

75 self.from_date = one_month_ago.strftime("%Y-%m-%d") 

76 else: 

77 self.from_date = from_date 

78 

79 if not to_date: 

80 # Default to today 

81 self.to_date = datetime.now(UTC).strftime("%Y-%m-%d") 

82 else: 

83 self.to_date = to_date 

84 

85 self.section = section 

86 self.order_by = order_by 

87 self._original_date_params = { 

88 "from_date": self.from_date, 

89 "to_date": self.to_date, 

90 } 

91 

92 # API base URL 

93 self.api_url = "https://content.guardianapis.com/search" 

94 
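The constructor resolves the API key (explicit parameter first, then the settings snapshot), defaults the date window to the past month, and stashes the original dates so later methods can restore them. A minimal construction sketch; the key value is hypothetical and the import path is inferred from the file path in the header above:

    from local_deep_research.web_search_engines.engines.search_engine_guardian import (
        GuardianSearchEngine,
    )

    engine = GuardianSearchEngine(
        api_key="YOUR_GUARDIAN_API_KEY",  # hypothetical; otherwise read from UI settings
        max_results=5,
        section="technology",
        order_by="newest",
        llm=None,  # without an LLM, query optimization and date adaptation are skipped
    )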

95 def _optimize_query_for_guardian(self, query: str) -> str: 

96 """ 

97 Optimize a natural language query for Guardian search. 

98 Uses LLM to transform questions into effective news search queries. 

99 

100 Args: 

101 query: Natural language query 

102 

103 Returns: 

104 Optimized query string for Guardian 

105 """ 

106 # Handle extremely long queries by truncating first 

107 if len(query) > 150: 

108 simple_query = " ".join(query.split()[:10]) 

109 logger.info( 

110 f"Query too long ({len(query)} chars), truncating to: {simple_query}" 

111 ) 

112 query = simple_query 

113 

114 if not self.llm or not self.optimize_queries:  # coverage: 114 ↛ 118 (condition was always true)

115 # Return original query if no LLM available or optimization disabled 

116 return query 

117 

118 try: 

119 # Prompt for query optimization 

120 prompt = f"""Transform this natural language question into a very short Guardian news search query. 

121 

122 Original query: "{query}"

123

124 CRITICAL RULES:

125 1. ONLY RETURN THE EXACT SEARCH QUERY - NO EXPLANATIONS, NO COMMENTS

126 2. Keep it EXTREMELY BRIEF - MAXIMUM 3-4 words total

127 3. Focus only on the main topic/person/event

128 4. Include proper names when relevant

129 5. Remove ALL unnecessary words

130 6. DO NOT use Boolean operators (no AND/OR)

131 7. DO NOT use quotes

132

133 EXAMPLE CONVERSIONS:

134 ✓ "What's the impact of rising interest rates on UK housing market?" → "UK housing rates"

135 ✓ "Latest developments in the Ukraine-Russia peace negotiations" → "Ukraine Russia negotiations"

136 ✓ "How are tech companies responding to AI regulation?" → "tech AI regulation"

137 ✓ "What is Donald Trump's current political activity?" → "Trump political activity"

138

139 Return ONLY the extremely brief search query.

140 """

141 

142 # Get response from LLM 

143 response = self.llm.invoke(prompt) 

144 optimized_query = remove_think_tags(response.content).strip() 

145 

146 # Clean up the query - remove any explanations 

147 lines = optimized_query.split("\n") 

148 for line in lines: 

149 line = line.strip() 

150 if line and not line.lower().startswith( 

151 ("here", "i would", "the best", "this query") 

152 ): 

153 optimized_query = line 

154 break 

155 

156 # Remove any quotes that wrap the entire query 

157 if ( 

158 optimized_query.startswith('"') 

159 and optimized_query.endswith('"') 

160 and optimized_query.count('"') == 2 

161 ): 

162 optimized_query = optimized_query[1:-1] 

163 

164 logger.info(f"Original query: '{query}'") 

165 logger.info(f"Optimized for Guardian: '{optimized_query}'") 

166 

167 return optimized_query 

168 

169 except Exception: 

170 logger.exception("Error optimizing query") 

171 return query # Fall back to original query on error 

172 
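The post-processing above keeps the first response line that does not read like commentary, then strips one pair of wrapping quotes. A standalone sketch of the same cleanup (clean_llm_query is a hypothetical helper name, not part of the module):

    def clean_llm_query(raw: str) -> str:
        """Mirror of the cleanup above: first non-commentary line, unwrapped."""
        query = raw.strip()
        for line in query.split("\n"):
            line = line.strip()
            if line and not line.lower().startswith(
                ("here", "i would", "the best", "this query")
            ):
                query = line
                break
        if query.startswith('"') and query.endswith('"') and query.count('"') == 2:
            query = query[1:-1]
        return query

    assert clean_llm_query('Here is the query:\n"UK housing rates"') == "UK housing rates"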

173 def _adapt_dates_for_query_type(self, query: str) -> None: 

174 """ 

175 Adapt date range based on query type (historical vs current). 

176 

177 Args: 

178 query: The search query 

179 """ 

180 # Fast path - for very short queries, default to recent news 

181 if len(query.split()) <= 4: 

182 logger.info("Short query detected, defaulting to recent news") 

183 # Default to 60 days for short queries 

184 recent = (datetime.now(UTC) - timedelta(days=60)).strftime( 

185 "%Y-%m-%d" 

186 ) 

187 self.from_date = recent 

188 self.order_by = "newest" 

189 return 

190 

191 if not self.llm or not self.adaptive_search:  # coverage: 191 ↛ 194 (condition was always true)

192 return 

193 

194 try: 

195 prompt = f"""Is this query asking about HISTORICAL events or CURRENT events? 

196 

197 Query: "{query}"

198

199 ONE WORD ANSWER ONLY:

200 - "HISTORICAL" if about past events (older than 1 year)

201 - "CURRENT" if about recent events (within past year)

202 - "UNCLEAR" if can't determine

203

204 ONE WORD ONLY:"""

205 

206 response = self.llm.invoke(prompt) 

207 answer = remove_think_tags(response.content).strip().upper() 

208 

209 # Reset to original parameters first 

210 self.from_date = self._original_date_params["from_date"] 

211 self.to_date = self._original_date_params["to_date"] 

212 

213 if "HISTORICAL" in answer: 

214 # For historical queries, go back 10 years 

215 logger.info( 

216 "Query classified as HISTORICAL - extending search timeframe" 

217 ) 

218 ten_years_ago = ( 

219 datetime.now(UTC) - timedelta(days=3650) 

220 ).strftime("%Y-%m-%d") 

221 self.from_date = ten_years_ago 

222 

223 elif "CURRENT" in answer: 

224 # For current events, focus on recent content 

225 logger.info( 

226 "Query classified as CURRENT - focusing on recent content" 

227 ) 

228 recent = (datetime.now(UTC) - timedelta(days=60)).strftime( 

229 "%Y-%m-%d" 

230 ) 

231 self.from_date = recent 

232 self.order_by = "newest" # Prioritize newest for current events 

233 

234 except Exception: 

235 logger.exception("Error adapting dates for query type") 

236 # Keep original date parameters on error 

237 
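For reference, the date windows this method can settle on, computed the same way as the code above (a sketch only):

    from datetime import datetime, timedelta, UTC

    now = datetime.now(UTC)
    current_or_short = (now - timedelta(days=60)).strftime("%Y-%m-%d")    # CURRENT / short query
    historical = (now - timedelta(days=3650)).strftime("%Y-%m-%d")        # HISTORICAL, ~10 years back
    constructor_default = (now - timedelta(days=30)).strftime("%Y-%m-%d")  # one month ago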

238 def _adaptive_search(self, query: str) -> Tuple[List[Dict[str, Any]], str]: 

239 """ 

240 Perform adaptive search that progressively adjusts parameters based on results. 

241 

242 Args: 

243 query: The search query 

244 

245 Returns: 

246 Tuple of (list of articles, search strategy used) 

247 """ 

248 # Try with current parameters 

249 articles = self._get_all_data(query) 

250 strategy = "initial" 

251 

252 # If no results or too few, try different strategies 

253 if len(articles) < 3 and self.adaptive_search:  # coverage: 253 ↛ 254 (condition was never true)

254 logger.info( 

255 f"Initial search found only {len(articles)} results, trying alternative strategies" 

256 ) 

257 

258 # Try with expanded date range 

259 original_from_date = self.from_date 

260 original_order_by = self.order_by 

261 

262 # Strategy 1: Expand to 6 months 

263 logger.info("Strategy 1: Expanding time range to 6 months") 

264 six_months_ago = (datetime.now(UTC) - timedelta(days=180)).strftime( 

265 "%Y-%m-%d" 

266 ) 

267 self.from_date = six_months_ago 

268 

269 articles1 = self._get_all_data(query) 

270 if len(articles1) > len(articles): 

271 articles = articles1 

272 strategy = "expanded_6mo" 

273 

274 # Strategy 2: Expand to all time and try relevance order 

275 if len(articles) < 3: 

276 logger.info( 

277 "Strategy 2: Expanding to all time with relevance ordering" 

278 ) 

279 self.from_date = "2000-01-01" # Effectively "all time" 

280 self.order_by = "relevance" 

281 

282 articles2 = self._get_all_data(query) 

283 if len(articles2) > len(articles): 

284 articles = articles2 

285 strategy = "all_time_relevance" 

286 

287 # Strategy 3: Try removing section constraints 

288 if len(articles) < 3 and self.section: 

289 logger.info("Strategy 3: Removing section constraint") 

290 original_section = self.section 

291 self.section = None 

292 

293 articles3 = self._get_all_data(query) 

294 if len(articles3) > len(articles): 

295 articles = articles3 

296 strategy = "no_section" 

297 

298 # Restore section setting 

299 self.section = original_section 

300 

301 # Restore original settings 

302 self.from_date = original_from_date 

303 self.order_by = original_order_by 

304 

305 logger.info( 

306 f"Adaptive search using strategy '{strategy}' found {len(articles)} results" 

307 ) 

308 return articles, strategy 

309 
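The escalation above is a fixed ladder: keep the current parameters, then widen to six months, then to all time with relevance ordering, then drop the section filter, keeping whichever attempt returned the most articles. An approximate sketch of the same idea (adaptive and search are hypothetical names):

    from datetime import datetime, timedelta, UTC

    six_months_ago = (datetime.now(UTC) - timedelta(days=180)).strftime("%Y-%m-%d")
    LADDER = [
        ("expanded_6mo", {"from_date": six_months_ago}),
        ("all_time_relevance", {"from_date": "2000-01-01", "order_by": "relevance"}),
        ("no_section", {"section": None}),
    ]

    def adaptive(search, base):
        """Try each rung while results stay thin; keep the best result set."""
        best, strategy = search(base), "initial"
        for name, override in LADDER:
            if len(best) >= 3:
                break
            candidate = search({**base, **override})
            if len(candidate) > len(best):
                best, strategy = candidate, name
        return best, strategy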

310 def _get_all_data(self, query: str) -> List[Dict[str, Any]]: 

311 """ 

312 Get all article data from The Guardian API in a single call. 

313 Always requests all fields for simplicity. 

314 

315 Args: 

316 query: The search query 

317 

318 Returns: 

319 List of articles with all data 

320 """ 

321 try: 

322 # Ensure query is not empty 

323 if not query or query.strip() == "":  # coverage: 323 ↛ 324 (condition was never true)

324 query = "news" 

325 logger.warning("Empty query provided, using 'news' as default") 

326 

327 # Ensure query is not too long for API 

328 if len(query) > 100:  # coverage: 328 ↛ 329 (condition was never true)

329 logger.warning( 

330 f"Query too long for Guardian API ({len(query)} chars), truncating" 

331 ) 

332 query = query[:100] 

333 

334 # Always request all fields for simplicity 

335 # Ensure max_results is an integer to avoid comparison errors 

336 page_size = min( 

337 int(self.max_results) if self.max_results is not None else 10, 

338 50, 

339 ) 

340 

341 # Log full parameters for debugging 

342 logger.info(f"Guardian API search query: '{query}'") 

343 logger.info( 

344 f"Guardian API date range: {self.from_date} to {self.to_date}" 

345 ) 

346 

347 params = { 

348 "q": query, 

349 "api-key": self.api_key, 

350 "from-date": self.from_date, 

351 "to-date": self.to_date, 

352 "order-by": self.order_by, 

353 "page-size": page_size, # API maximum is 50 

354 "show-fields": "headline,trailText,byline,body,publication", 

355 "show-tags": "keyword", 

356 } 

357 

358 # Add section filter if specified 

359 if self.section:  # coverage: 359 ↛ 360 (condition was never true)

360 params["section"] = self.section 

361 

362 # Log the complete request parameters (except API key) 

363 log_params = params.copy() 

364 log_params["api-key"] = "REDACTED" 

365 logger.info(f"Guardian API request parameters: {log_params}") 

366 

367 # Apply rate limiting before request 

368 self._last_wait_time = self.rate_tracker.apply_rate_limit( 

369 self.engine_type 

370 ) 

371 

372 # Execute the API request 

373 response = safe_get(self.api_url, params=params) 

374 response.raise_for_status() 

375 

376 data = response.json() 

377 

378 # Extract results from the response 

379 articles = data.get("response", {}).get("results", []) 

380 logger.info(f"Guardian API returned {len(articles)} articles") 

381 

382 # Format results to include all data 

383 formatted_articles = [] 

384 for i, article in enumerate(articles): 

385 if i >= self.max_results:  # coverage: 385 ↛ 386 (condition was never true)

386 break 

387 

388 fields = article.get("fields", {}) 

389 

390 # Format the article with all fields 

391 result = { 

392 "id": article.get("id", ""), 

393 "title": fields.get( 

394 "headline", article.get("webTitle", "") 

395 ), 

396 "link": article.get("webUrl", ""), 

397 "snippet": fields.get("trailText", ""), 

398 "publication_date": article.get("webPublicationDate", ""), 

399 "section": article.get("sectionName", ""), 

400 "author": fields.get("byline", ""), 

401 "content": fields.get("body", ""), 

402 "full_content": fields.get("body", ""), 

403 } 

404 

405 # Extract tags/keywords 

406 tags = article.get("tags", []) 

407 result["keywords"] = [ 

408 tag.get("webTitle", "") 

409 for tag in tags 

410 if tag.get("type") == "keyword" 

411 ] 

412 

413 formatted_articles.append(result) 

414 

415 return formatted_articles 

416 

417 except Exception: 

418 logger.exception("Error getting data from The Guardian API") 

419 return [] 

420 
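For reference, the call above amounts to a single GET against the public Content API. A minimal reproduction with plain requests (hypothetical key and dates, and without the module's rate limiting and safe_get protections):

    import requests

    resp = requests.get(
        "https://content.guardianapis.com/search",
        params={
            "q": "UK housing rates",
            "api-key": "YOUR_GUARDIAN_API_KEY",  # hypothetical
            "from-date": "2025-12-12",
            "to-date": "2026-01-11",
            "order-by": "relevance",
            "page-size": 10,  # the module caps this at the API maximum of 50
            "show-fields": "headline,trailText,byline,body,publication",
            "show-tags": "keyword",
        },
        timeout=30,
    )
    resp.raise_for_status()
    articles = resp.json().get("response", {}).get("results", [])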

421 def _get_previews(self, query: str) -> List[Dict[str, Any]]: 

422 """ 

423 Get preview information for Guardian articles with enhanced optimization. 

424 

425 Args: 

426 query: The search query 

427 

428 Returns: 

429 List of preview dictionaries 

430 """ 

431 logger.info( 

432 f"Getting articles from The Guardian API for query: {query}" 

433 ) 

434 

435 # Step 1: Optimize the query using LLM 

436 optimized_query = self._optimize_query_for_guardian(query) 

437 

438 # Step 2: Adapt date parameters based on query type 

439 self._adapt_dates_for_query_type(optimized_query) 

440 

441 # Step 3: Perform adaptive search 

442 articles, strategy = self._adaptive_search(optimized_query) 

443 

444 # Store search metadata for debugging 

445 self._search_metadata = { 

446 "original_query": query, 

447 "optimized_query": optimized_query, 

448 "strategy": strategy, 

449 "from_date": self.from_date, 

450 "to_date": self.to_date, 

451 "section": self.section, 

452 "order_by": self.order_by, 

453 } 

454 

455 # Store full articles for later use 

456 self._full_articles = {a["id"]: a for a in articles} 

457 

458 # Return only preview fields for each article 

459 previews = [] 

460 for article in articles:  # coverage: 460 ↛ 461 (loop never started)

461 preview = { 

462 "id": article["id"], 

463 "title": article["title"], 

464 "link": article["link"], 

465 "snippet": article["snippet"], 

466 "publication_date": article["publication_date"], 

467 "section": article["section"], 

468 "author": article["author"], 

469 "keywords": article.get("keywords", []), 

470 } 

471 previews.append(preview) 

472 

473 return previews 

474 
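Each preview deliberately omits the article body, which stays behind in the _full_articles cache keyed by article id until _get_full_content restores it. The preview shape, with field names taken from the code above:

    preview = {
        "id": "...",             # Guardian article id, used as the cache key
        "title": "...",          # headline, falling back to webTitle
        "link": "...",
        "snippet": "...",        # trailText
        "publication_date": "...",
        "section": "...",
        "author": "...",         # byline
        "keywords": ["..."],     # keyword tags
    }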

475 def _get_full_content( 

476 self, relevant_items: List[Dict[str, Any]] 

477 ) -> List[Dict[str, Any]]: 

478 """ 

479 Get full content for the relevant Guardian articles. 

480 Restores full content from the cached data. 

481 

482 Args: 

483 relevant_items: List of relevant preview dictionaries 

484 

485 Returns: 

486 List of result dictionaries with full content 

487 """ 

488 logger.info( 

489 f"Adding full content to {len(relevant_items)} relevant Guardian articles" 

490 ) 

491 

492 # Check if we should add full content 

493 if ( 

494 hasattr(search_config, "SEARCH_SNIPPETS_ONLY") 

495 and search_config.SEARCH_SNIPPETS_ONLY 

496 ): 

497 return relevant_items 

498 

499 # Get full articles for relevant items 

500 results = [] 

501 for item in relevant_items: 

502 article_id = item.get("id", "") 

503 

504 # Get the full article from our cache 

505 if ( 

506 hasattr(self, "_full_articles") 

507 and article_id in self._full_articles 

508 ): 

509 results.append(self._full_articles[article_id]) 

510 else: 

511 # If not found (shouldn't happen), just use the preview 

512 results.append(item) 

513 

514 return results 

515 
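The snippets-only guard means full bodies are attached only when the configuration allows it. A sketch of exercising both paths; this assumes SEARCH_SNIPPETS_ONLY is writable in a test context and that engine was constructed as in the earlier sketch:

    from local_deep_research.config import search_config

    search_config.SEARCH_SNIPPETS_ONLY = True   # previews pass through unchanged
    light = engine.run("tech AI regulation")

    search_config.SEARCH_SNIPPETS_ONLY = False  # bodies restored from _full_articles
    full = engine.run("tech AI regulation")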

516 def run( 

517 self, query: str, research_context: Dict[str, Any] | None = None 

518 ) -> List[Dict[str, Any]]: 

519 """ 

520 Execute a search using The Guardian API with the enhanced approach. 

521 

522 Args: 

523 query: The search query 

524 research_context: Context from previous research to use. 

525 

526 Returns: 

527 List of search results 

528 """ 

529 logger.info("---Execute a search using The Guardian (enhanced)---") 

530 

531 # Additional safety check for None query 

532 if query is None:  # coverage: 532 ↛ 536 (condition was always true)

533 logger.error("None query passed to Guardian search engine") 

534 query = "news" 

535 

536 try: 

537 # Get previews with our enhanced method 

538 previews = self._get_previews(query) 

539 

540 # If no results, try one more time with a simplified query 

541 if not previews:  # coverage: 541 ↛ 559 (condition was always true)

542 simple_query = " ".join( 

543 [w for w in query.split() if len(w) > 3][:3] 

544 ) 

545 logger.warning( 

546 f"No Guardian articles found, trying simplified query: {simple_query}" 

547 ) 

548 previews = self._get_previews(simple_query) 

549 

550 # If still no results, try with a very generic query as last resort 

551 if not previews and "trump" in query.lower():  # coverage: 551 ↛ 552 (condition was never true)

552 logger.warning("Trying last resort query: 'Donald Trump'") 

553 previews = self._get_previews("Donald Trump") 

554 elif not previews:  # coverage: 554 ↛ 559 (condition was always true)

555 logger.warning("Trying last resort query: 'news'") 

556 previews = self._get_previews("news") 

557 

558 # If still no results after all attempts, return empty list 

559 if not previews:  # coverage: 559 ↛ 566 (condition was always true)

560 logger.warning( 

561 "No Guardian articles found after multiple attempts" 

562 ) 

563 return [] 

564 

565 # Filter for relevance if we have an LLM 

566 if ( 

567 self.llm 

568 and hasattr(self, "max_filtered_results") 

569 and self.max_filtered_results 

570 ): 

571 filtered_items = self._filter_for_relevance(previews, query) 

572 if not filtered_items: 

573 # Fall back to unfiltered results if everything was filtered out 

574 logger.warning( 

575 "All articles filtered out, using unfiltered results" 

576 ) 

577 filtered_items = previews[: self.max_filtered_results] 

578 else: 

579 filtered_items = previews 

580 

581 # Get full content for relevant items 

582 results = self._get_full_content(filtered_items) 

583 

584 # Add source information to make it clear these are from The Guardian 

585 for result in results: 

586 if "source" not in result: 

587 result["source"] = "The Guardian" 

588 

589 # Clean up the cache after use 

590 if hasattr(self, "_full_articles"): 

591 del self._full_articles 

592 

593 # Restore original date parameters 

594 self.from_date = self._original_date_params["from_date"] 

595 self.to_date = self._original_date_params["to_date"] 

596 

597 # Log search metadata if available 

598 if hasattr(self, "_search_metadata"): 

599 logger.info(f"Search metadata: {self._search_metadata}") 

600 del self._search_metadata 

601 

602 return results 

603 

604 except Exception: 

605 logger.exception("Error in Guardian search") 

606 

607 # Restore original date parameters on error 

608 self.from_date = self._original_date_params["from_date"] 

609 self.to_date = self._original_date_params["to_date"] 

610 

611 return [] 

612 
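End to end, run chains query optimization, date adaptation, adaptive search, optional relevance filtering, and content restoration, with progressively simpler fallback queries when nothing comes back. A usage sketch (hypothetical query, engine as above):

    results = engine.run(
        "What's the impact of rising interest rates on UK housing market?"
    )
    for r in results:
        print(r["source"], r["publication_date"], r["title"], r["link"])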

613 def search_by_section( 

614 self, section: str, max_results: Optional[int] = None 

615 ) -> List[Dict[str, Any]]: 

616 """ 

617 Search for articles in a specific section. 

618 

619 Args: 

620 section: The Guardian section name (e.g., "politics", "technology") 

621 max_results: Maximum number of results (defaults to self.max_results) 

622 

623 Returns: 

624 List of articles in the section 

625 """ 

626 original_section = self.section 

627 original_max_results = self.max_results 

628 

629 try: 

630 # Set section and max_results for this search 

631 self.section = section 

632 if max_results: 

633 self.max_results = max_results 

634 

635 # Use empty query to get all articles in the section 

636 return self.run("") 

637 

638 finally: 

639 # Restore original values 

640 self.section = original_section 

641 self.max_results = original_max_results 

642 
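A convenience-call sketch; note that the empty query handed to run here is ultimately replaced with "news" by _get_all_data:

    politics = engine.search_by_section("politics", max_results=20)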

643 def get_recent_articles( 

644 self, days: int = 7, max_results: Optional[int] = None 

645 ) -> List[Dict[str, Any]]: 

646 """ 

647 Get recent articles from The Guardian. 

648 

649 Args: 

650 days: Number of days to look back 

651 max_results: Maximum number of results (defaults to self.max_results) 

652 

653 Returns: 

654 List of recent articles 

655 """ 

656 original_from_date = self.from_date 

657 original_order_by = self.order_by 

658 original_max_results = self.max_results 

659 

660 try: 

661 # Set parameters for this search 

662 self.from_date = ( 

663 datetime.now(UTC) - timedelta(days=days) 

664 ).strftime("%Y-%m-%d") 

665 self.order_by = "newest" 

666 if max_results: 

667 self.max_results = max_results 

668 

669 # Use empty query to get all recent articles 

670 return self.run("") 

671 

672 finally: 

673 # Restore original values 

674 self.from_date = original_from_date 

675 self.order_by = original_order_by 

676 self.max_results = original_max_results
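And the recency helper, which temporarily narrows from_date and forces newest-first ordering before the finally block restores the originals:

    this_week = engine.get_recent_articles(days=7, max_results=10)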