Coverage for src/local_deep_research/web_search_engines/engines/search_engine_guardian.py: 44% (243 statements)
coverage.py v7.12.0, created at 2026-01-11 00:51 +0000
from datetime import datetime, timedelta, UTC
from typing import Any, Dict, List, Optional, Tuple

from langchain_core.language_models import BaseLLM
from loguru import logger

from ...config import search_config
from ...config.search_config import get_setting_from_snapshot
from ...utilities.search_utilities import remove_think_tags
from ...security.safe_requests import safe_get
from ..search_engine_base import BaseSearchEngine


class GuardianSearchEngine(BaseSearchEngine):
    """Enhanced Guardian API search engine implementation with LLM query optimization"""

    # Mark as public search engine
    is_public = True

    def __init__(
        self,
        max_results: int = 10,
        api_key: Optional[str] = None,
        from_date: Optional[str] = None,
        to_date: Optional[str] = None,
        section: Optional[str] = None,
        order_by: str = "relevance",
        llm: Optional[BaseLLM] = None,
        max_filtered_results: Optional[int] = None,
        optimize_queries: bool = True,
        adaptive_search: bool = True,
        **kwargs,
    ):
        """
        Initialize The Guardian search engine with enhanced features.

        Args:
            max_results: Maximum number of search results
            api_key: The Guardian API key (can also be configured in the UI settings)
            from_date: Start date for search (YYYY-MM-DD format, default 1 month ago)
            to_date: End date for search (YYYY-MM-DD format, default today)
            section: Filter by section (e.g., "politics", "technology", "sport")
            order_by: Sort order ("relevance", "newest", "oldest")
            llm: Language model for relevance filtering and query optimization
            max_filtered_results: Maximum number of results to keep after filtering
            optimize_queries: Whether to optimize queries using the LLM
            adaptive_search: Whether to use adaptive search (adjusting date ranges)
        """
        # Initialize the BaseSearchEngine with LLM, max_filtered_results, and max_results
        super().__init__(
            llm=llm,
            max_filtered_results=max_filtered_results,
            max_results=max_results,
        )

        # Get API key - check params or database
        guardian_api_key = api_key
        if not guardian_api_key:
            guardian_api_key = get_setting_from_snapshot(
                "search.engine.web.guardian.api_key",
                settings_snapshot=kwargs.get("settings_snapshot"),
            )
        self.api_key = guardian_api_key
        self.optimize_queries = optimize_queries
        self.adaptive_search = adaptive_search

        if not self.api_key:
            raise ValueError(
                "Guardian API key not found. Please provide api_key parameter or set it in the UI settings."
            )

        # Set date ranges if not provided
        if not from_date:
            # Default to one month ago
            one_month_ago = datetime.now(UTC) - timedelta(days=30)
            self.from_date = one_month_ago.strftime("%Y-%m-%d")
        else:
            self.from_date = from_date

        if not to_date:
            # Default to today
            self.to_date = datetime.now(UTC).strftime("%Y-%m-%d")
        else:
            self.to_date = to_date

        self.section = section
        self.order_by = order_by
        self._original_date_params = {
            "from_date": self.from_date,
            "to_date": self.to_date,
        }

        # API base URL
        self.api_url = "https://content.guardianapis.com/search"
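
    # A minimal usage sketch (illustrative only, not part of the module;
    # assumes a valid API key, which the application normally supplies via
    # its settings snapshot):
    #
    #   engine = GuardianSearchEngine(
    #       api_key="...",  # placeholder
    #       max_results=5,
    #       section="technology",
    #       order_by="newest",
    #   )
    #   results = engine.run("AI regulation")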

    def _optimize_query_for_guardian(self, query: str) -> str:
        """
        Optimize a natural language query for Guardian search.
        Uses LLM to transform questions into effective news search queries.

        Args:
            query: Natural language query

        Returns:
            Optimized query string for Guardian
        """
        # Handle extremely long queries by truncating first
        if len(query) > 150:
            simple_query = " ".join(query.split()[:10])
            logger.info(
                f"Query too long ({len(query)} chars), truncating to: {simple_query}"
            )
            query = simple_query

        if not self.llm or not self.optimize_queries:
            # Return original query if no LLM available or optimization disabled
            return query

        try:
            # Prompt for query optimization
            prompt = f"""Transform this natural language question into a very short Guardian news search query.

Original query: "{query}"

CRITICAL RULES:
1. ONLY RETURN THE EXACT SEARCH QUERY - NO EXPLANATIONS, NO COMMENTS
2. Keep it EXTREMELY BRIEF - MAXIMUM 3-4 words total
3. Focus only on the main topic/person/event
4. Include proper names when relevant
5. Remove ALL unnecessary words
6. DO NOT use Boolean operators (no AND/OR)
7. DO NOT use quotes

EXAMPLE CONVERSIONS:
✓ "What's the impact of rising interest rates on UK housing market?" → "UK housing rates"
✓ "Latest developments in the Ukraine-Russia peace negotiations" → "Ukraine Russia negotiations"
✓ "How are tech companies responding to AI regulation?" → "tech AI regulation"
✓ "What is Donald Trump's current political activity?" → "Trump political activity"

Return ONLY the extremely brief search query.
"""

            # Get response from LLM
            response = self.llm.invoke(prompt)
            optimized_query = remove_think_tags(response.content).strip()

            # Clean up the query - remove any explanations
            lines = optimized_query.split("\n")
            for line in lines:
                line = line.strip()
                if line and not line.lower().startswith(
                    ("here", "i would", "the best", "this query")
                ):
                    optimized_query = line
                    break

            # Remove any quotes that wrap the entire query
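            # (applies only when exactly one pair of double quotes wraps the string)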
            if (
                optimized_query.startswith('"')
                and optimized_query.endswith('"')
                and optimized_query.count('"') == 2
            ):
                optimized_query = optimized_query[1:-1]

            logger.info(f"Original query: '{query}'")
            logger.info(f"Optimized for Guardian: '{optimized_query}'")

            return optimized_query

        except Exception:
            logger.exception("Error optimizing query")
            return query  # Fall back to original query on error

    def _adapt_dates_for_query_type(self, query: str) -> None:
        """
        Adapt date range based on query type (historical vs current).

        Args:
            query: The search query
        """
        # Fast path - for very short queries, default to recent news
        if len(query.split()) <= 4:
            logger.info("Short query detected, defaulting to recent news")
            # Default to 60 days for short queries
            recent = (datetime.now(UTC) - timedelta(days=60)).strftime(
                "%Y-%m-%d"
            )
            self.from_date = recent
            self.order_by = "newest"
            return

        if not self.llm or not self.adaptive_search:
            return

        try:
            prompt = f"""Is this query asking about HISTORICAL events or CURRENT events?

Query: "{query}"

ONE WORD ANSWER ONLY:
- "HISTORICAL" if about past events (older than 1 year)
- "CURRENT" if about recent events (within past year)
- "UNCLEAR" if can't determine

ONE WORD ONLY:"""

            response = self.llm.invoke(prompt)
            answer = remove_think_tags(response.content).strip().upper()

            # Reset to original parameters first
            self.from_date = self._original_date_params["from_date"]
            self.to_date = self._original_date_params["to_date"]

            if "HISTORICAL" in answer:
                # For historical queries, go back 10 years
                logger.info(
                    "Query classified as HISTORICAL - extending search timeframe"
                )
                ten_years_ago = (
                    datetime.now(UTC) - timedelta(days=3650)
                ).strftime("%Y-%m-%d")
                self.from_date = ten_years_ago

            elif "CURRENT" in answer:
                # For current events, focus on recent content
                logger.info(
                    "Query classified as CURRENT - focusing on recent content"
                )
                recent = (datetime.now(UTC) - timedelta(days=60)).strftime(
                    "%Y-%m-%d"
                )
                self.from_date = recent
                self.order_by = "newest"  # Prioritize newest for current events

        except Exception:
            logger.exception("Error adapting dates for query type")
            # Keep original date parameters on error

    def _adaptive_search(self, query: str) -> Tuple[List[Dict[str, Any]], str]:
        """
        Perform adaptive search that progressively adjusts parameters based on results.

        Args:
            query: The search query

        Returns:
            Tuple of (list of articles, search strategy used)
        """
        # Try with current parameters
        articles = self._get_all_data(query)
        strategy = "initial"

        # If no results or too few, try different strategies
        if len(articles) < 3 and self.adaptive_search:
            logger.info(
                f"Initial search found only {len(articles)} results, trying alternative strategies"
            )

            # Try with expanded date range
            original_from_date = self.from_date
            original_order_by = self.order_by

            # Strategy 1: Expand to 6 months
            logger.info("Strategy 1: Expanding time range to 6 months")
            six_months_ago = (datetime.now(UTC) - timedelta(days=180)).strftime(
                "%Y-%m-%d"
            )
            self.from_date = six_months_ago

            articles1 = self._get_all_data(query)
            if len(articles1) > len(articles):
                articles = articles1
                strategy = "expanded_6mo"

            # Strategy 2: Expand to all time and try relevance order
            if len(articles) < 3:
                logger.info(
                    "Strategy 2: Expanding to all time with relevance ordering"
                )
                self.from_date = "2000-01-01"  # Effectively "all time"
                self.order_by = "relevance"

                articles2 = self._get_all_data(query)
                if len(articles2) > len(articles):
                    articles = articles2
                    strategy = "all_time_relevance"

            # Strategy 3: Try removing section constraints
            if len(articles) < 3 and self.section:
                logger.info("Strategy 3: Removing section constraint")
                original_section = self.section
                self.section = None

                articles3 = self._get_all_data(query)
                if len(articles3) > len(articles):
                    articles = articles3
                    strategy = "no_section"

                # Restore section setting
                self.section = original_section

            # Restore original settings
            self.from_date = original_from_date
            self.order_by = original_order_by

        logger.info(
            f"Adaptive search using strategy '{strategy}' found {len(articles)} results"
        )
        return articles, strategy

    def _get_all_data(self, query: str) -> List[Dict[str, Any]]:
        """
        Get all article data from The Guardian API in a single call.
        Always requests all fields for simplicity.

        Args:
            query: The search query

        Returns:
            List of articles with all data
        """
        try:
            # Ensure query is not empty
            if not query or query.strip() == "":
                query = "news"
                logger.warning("Empty query provided, using 'news' as default")

            # Ensure query is not too long for API
            if len(query) > 100:
                logger.warning(
                    f"Query too long for Guardian API ({len(query)} chars), truncating"
                )
                query = query[:100]

            # Always request all fields for simplicity
            # Ensure max_results is an integer to avoid comparison errors
            page_size = min(
                int(self.max_results) if self.max_results is not None else 10,
                50,
            )

            # Log full parameters for debugging
            logger.info(f"Guardian API search query: '{query}'")
            logger.info(
                f"Guardian API date range: {self.from_date} to {self.to_date}"
            )
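
            # Note: the Content API uses hyphenated parameter names;
            # "show-fields" and "show-tags" ask it to include extra per-article
            # data (headline, trail text, byline, full body, and keyword tags).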
            params = {
                "q": query,
                "api-key": self.api_key,
                "from-date": self.from_date,
                "to-date": self.to_date,
                "order-by": self.order_by,
                "page-size": page_size,  # API maximum is 50
                "show-fields": "headline,trailText,byline,body,publication",
                "show-tags": "keyword",
            }

            # Add section filter if specified
            if self.section:
                params["section"] = self.section

            # Log the complete request parameters (except API key)
            log_params = params.copy()
            log_params["api-key"] = "REDACTED"
            logger.info(f"Guardian API request parameters: {log_params}")

            # Apply rate limiting before request
            self._last_wait_time = self.rate_tracker.apply_rate_limit(
                self.engine_type
            )

            # Execute the API request
            response = safe_get(self.api_url, params=params)
            response.raise_for_status()

            data = response.json()

            # Extract results from the response
            articles = data.get("response", {}).get("results", [])
            logger.info(f"Guardian API returned {len(articles)} articles")

            # Format results to include all data
            formatted_articles = []
            for i, article in enumerate(articles):
                if i >= self.max_results:
                    break

                fields = article.get("fields", {})

                # Format the article with all fields
                result = {
                    "id": article.get("id", ""),
                    "title": fields.get(
                        "headline", article.get("webTitle", "")
                    ),
                    "link": article.get("webUrl", ""),
                    "snippet": fields.get("trailText", ""),
                    "publication_date": article.get("webPublicationDate", ""),
                    "section": article.get("sectionName", ""),
                    "author": fields.get("byline", ""),
                    "content": fields.get("body", ""),
                    "full_content": fields.get("body", ""),
                }

                # Extract tags/keywords
                tags = article.get("tags", [])
                result["keywords"] = [
                    tag.get("webTitle", "")
                    for tag in tags
                    if tag.get("type") == "keyword"
                ]
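                # (these come back because of the "show-tags": "keyword"
                # request parameter above)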

                formatted_articles.append(result)

            return formatted_articles

        except Exception:
            logger.exception("Error getting data from The Guardian API")
            return []

    def _get_previews(self, query: str) -> List[Dict[str, Any]]:
        """
        Get preview information for Guardian articles with enhanced optimization.

        Args:
            query: The search query

        Returns:
            List of preview dictionaries
        """
        logger.info(
            f"Getting articles from The Guardian API for query: {query}"
        )

        # Step 1: Optimize the query using LLM
        optimized_query = self._optimize_query_for_guardian(query)

        # Step 2: Adapt date parameters based on query type
        self._adapt_dates_for_query_type(optimized_query)

        # Step 3: Perform adaptive search
        articles, strategy = self._adaptive_search(optimized_query)

        # Store search metadata for debugging
        self._search_metadata = {
            "original_query": query,
            "optimized_query": optimized_query,
            "strategy": strategy,
            "from_date": self.from_date,
            "to_date": self.to_date,
            "section": self.section,
            "order_by": self.order_by,
        }

        # Store full articles for later use
        self._full_articles = {a["id"]: a for a in articles}
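        # (run() deletes this cache once full content has been attached)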

        # Return only preview fields for each article
        previews = []
        for article in articles:
            preview = {
                "id": article["id"],
                "title": article["title"],
                "link": article["link"],
                "snippet": article["snippet"],
                "publication_date": article["publication_date"],
                "section": article["section"],
                "author": article["author"],
                "keywords": article.get("keywords", []),
            }
            previews.append(preview)

        return previews

    def _get_full_content(
        self, relevant_items: List[Dict[str, Any]]
    ) -> List[Dict[str, Any]]:
        """
        Get full content for the relevant Guardian articles.
        Restores full content from the cached data.

        Args:
            relevant_items: List of relevant preview dictionaries

        Returns:
            List of result dictionaries with full content
        """
        logger.info(
            f"Adding full content to {len(relevant_items)} relevant Guardian articles"
        )

        # Check if we should add full content
        if (
            hasattr(search_config, "SEARCH_SNIPPETS_ONLY")
            and search_config.SEARCH_SNIPPETS_ONLY
        ):
            return relevant_items

        # Get full articles for relevant items
        results = []
        for item in relevant_items:
            article_id = item.get("id", "")

            # Get the full article from our cache
            if (
                hasattr(self, "_full_articles")
                and article_id in self._full_articles
            ):
                results.append(self._full_articles[article_id])
            else:
                # If not found (shouldn't happen), just use the preview
                results.append(item)

        return results

    def run(
        self, query: str, research_context: Dict[str, Any] | None = None
    ) -> List[Dict[str, Any]]:
        """
        Execute a search using The Guardian API with the enhanced approach.

        Args:
            query: The search query
            research_context: Context from previous research to use.

        Returns:
            List of search results
        """
        logger.info("---Execute a search using The Guardian (enhanced)---")

        # Additional safety check for None query
        if query is None:
            logger.error("None query passed to Guardian search engine")
            query = "news"

        try:
            # Get previews with our enhanced method
            previews = self._get_previews(query)

            # If no results, try one more time with a simplified query
            if not previews:
                simple_query = " ".join(
                    [w for w in query.split() if len(w) > 3][:3]
                )
                logger.warning(
                    f"No Guardian articles found, trying simplified query: {simple_query}"
                )
                previews = self._get_previews(simple_query)

                # If still no results, try with a very generic query as last resort
                if not previews and "trump" in query.lower():
                    logger.warning("Trying last resort query: 'Donald Trump'")
                    previews = self._get_previews("Donald Trump")
                elif not previews:
                    logger.warning("Trying last resort query: 'news'")
                    previews = self._get_previews("news")

            # If still no results after all attempts, return empty list
            if not previews:
                logger.warning(
                    "No Guardian articles found after multiple attempts"
                )
                return []

            # Filter for relevance if we have an LLM
            if (
                self.llm
                and hasattr(self, "max_filtered_results")
                and self.max_filtered_results
            ):
                filtered_items = self._filter_for_relevance(previews, query)
                if not filtered_items:
                    # Fall back to unfiltered results if everything was filtered out
                    logger.warning(
                        "All articles filtered out, using unfiltered results"
                    )
                    filtered_items = previews[: self.max_filtered_results]
            else:
                filtered_items = previews

            # Get full content for relevant items
            results = self._get_full_content(filtered_items)

            # Add source information to make it clear these are from The Guardian
            for result in results:
                if "source" not in result:
                    result["source"] = "The Guardian"

            # Clean up the cache after use
            if hasattr(self, "_full_articles"):
                del self._full_articles

            # Restore original date parameters
            self.from_date = self._original_date_params["from_date"]
            self.to_date = self._original_date_params["to_date"]

            # Log search metadata if available
            if hasattr(self, "_search_metadata"):
                logger.info(f"Search metadata: {self._search_metadata}")
                del self._search_metadata

            return results

        except Exception:
            logger.exception("Error in Guardian search")

            # Restore original date parameters on error
            self.from_date = self._original_date_params["from_date"]
            self.to_date = self._original_date_params["to_date"]

            return []

    def search_by_section(
        self, section: str, max_results: Optional[int] = None
    ) -> List[Dict[str, Any]]:
        """
        Search for articles in a specific section.

        Args:
            section: The Guardian section name (e.g., "politics", "technology")
            max_results: Maximum number of results (defaults to self.max_results)

        Returns:
            List of articles in the section
        """
        original_section = self.section
        original_max_results = self.max_results

        try:
            # Set section and max_results for this search
            self.section = section
            if max_results:
                self.max_results = max_results

            # Use an empty query to get all articles in the section
            # (note: _get_all_data substitutes "news" for empty queries)
            return self.run("")

        finally:
            # Restore original values
            self.section = original_section
            self.max_results = original_max_results

    def get_recent_articles(
        self, days: int = 7, max_results: Optional[int] = None
    ) -> List[Dict[str, Any]]:
        """
        Get recent articles from The Guardian.

        Args:
            days: Number of days to look back
            max_results: Maximum number of results (defaults to self.max_results)

        Returns:
            List of recent articles
        """
        original_from_date = self.from_date
        original_order_by = self.order_by
        original_max_results = self.max_results

        try:
            # Set parameters for this search
            self.from_date = (
                datetime.now(UTC) - timedelta(days=days)
            ).strftime("%Y-%m-%d")
            self.order_by = "newest"
            if max_results:
                self.max_results = max_results

            # Use an empty query to get all recent articles
            # (note: _get_all_data substitutes "news" for empty queries)
            return self.run("")

        finally:
            # Restore original values
            self.from_date = original_from_date
            self.order_by = original_order_by
            self.max_results = original_max_results
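

# Example usage (a sketch, commented out on purpose; assumes a valid Guardian
# API key is configured in the UI settings or passed explicitly):
#
#   engine = GuardianSearchEngine(api_key="...")  # placeholder key
#   recent = engine.get_recent_articles(days=7, max_results=5)
#   for article in recent:
#       print(article["publication_date"], article["title"])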