Coverage for src / local_deep_research / web_search_engines / engines / search_engine_guardian.py: 95%
241 statements
« prev ^ index » next coverage.py v7.13.5, created at 2026-04-14 23:55 +0000
1from datetime import datetime, timedelta, UTC
2from typing import Any, Dict, List, Optional, Tuple
4from langchain_core.language_models import BaseLLM
5from loguru import logger
7from ...utilities.search_utilities import remove_think_tags
8from ...security.safe_requests import safe_get
9from ..rate_limiting import RateLimitError
10from ..search_engine_base import BaseSearchEngine
class GuardianSearchEngine(BaseSearchEngine):
    """Guardian API search engine with LLM-assisted query optimization."""

    # Mark as public search engine
    is_public = True

    def __init__(
        self,
        max_results: int = 10,
        api_key: Optional[str] = None,
        from_date: Optional[str] = None,
        to_date: Optional[str] = None,
        section: Optional[str] = None,
        order_by: str = "relevance",
        llm: Optional[BaseLLM] = None,
        max_filtered_results: Optional[int] = None,
        optimize_queries: bool = True,
        adaptive_search: bool = True,
        settings_snapshot: Optional[Dict[str, Any]] = None,
        **kwargs,
    ):
        """
        Initialize The Guardian search engine.

        Args:
            max_results: Maximum number of search results
            api_key: The Guardian API key (can also be set via
                LDR_SEARCH_ENGINE_WEB_GUARDIAN_API_KEY env var or in UI settings)
            from_date: Start date for search (YYYY-MM-DD, default 1 month ago)
            to_date: End date for search (YYYY-MM-DD, default today)
            section: Filter by section (e.g., "politics", "technology", "sport")
            order_by: Sort order ("relevance", "newest", "oldest")
            llm: Language model for relevance filtering and query optimization
            max_filtered_results: Maximum number of results kept after filtering
            optimize_queries: Whether to optimize queries using the LLM
            adaptive_search: Whether to adaptively adjust date ranges
            settings_snapshot: Optional settings snapshot for key resolution
        """
        super().__init__(
            llm=llm,
            max_filtered_results=max_filtered_results,
            max_results=max_results,
            settings_snapshot=settings_snapshot,
        )

        # Resolve the API key from the explicit argument, settings, or env.
        self.api_key = self._resolve_api_key(
            api_key,
            "search.engine.web.guardian.api_key",
            engine_name="Guardian",
            settings_snapshot=settings_snapshot,
        )
        self.optimize_queries = optimize_queries
        self.adaptive_search = adaptive_search

        # Default date window: the last 30 days, ending today.
        now = datetime.now(UTC)
        self.from_date = from_date or (now - timedelta(days=30)).strftime(
            "%Y-%m-%d"
        )
        self.to_date = to_date or now.strftime("%Y-%m-%d")

        self.section = section
        self.order_by = order_by
        # Remember the initial window so adaptive adjustments can be rolled back.
        self._original_date_params = {
            "from_date": self.from_date,
            "to_date": self.to_date,
        }

        # Guardian Content API search endpoint.
        self.api_url = "https://content.guardianapis.com/search"
91 def _optimize_query_for_guardian(self, query: str) -> str:
92 """
93 Optimize a natural language query for Guardian search.
94 Uses LLM to transform questions into effective news search queries.
96 Args:
97 query: Natural language query
99 Returns:
100 Optimized query string for Guardian
101 """
102 # Handle extremely long queries by truncating first
103 if len(query) > 150:
104 simple_query = " ".join(query.split()[:10])
105 logger.info(
106 f"Query too long ({len(query)} chars), truncating to: {simple_query}"
107 )
108 query = simple_query
110 if not self.llm or not self.optimize_queries:
111 # Return original query if no LLM available or optimization disabled
112 return query
114 try:
115 # Prompt for query optimization
116 prompt = f"""Transform this natural language question into a very short Guardian news search query.
118Original query: "{query}"
120CRITICAL RULES:
1211. ONLY RETURN THE EXACT SEARCH QUERY - NO EXPLANATIONS, NO COMMENTS
1222. Keep it EXTREMELY BRIEF - MAXIMUM 3-4 words total
1233. Focus only on the main topic/person/event
1244. Include proper names when relevant
1255. Remove ALL unnecessary words
1266. DO NOT use Boolean operators (no AND/OR)
1277. DO NOT use quotes
129EXAMPLE CONVERSIONS:
130✓ "What's the impact of rising interest rates on UK housing market?" → "UK housing rates"
131✓ "Latest developments in the Ukraine-Russia peace negotiations" → "Ukraine Russia negotiations"
132✓ "How are tech companies responding to AI regulation?" → "tech AI regulation"
133✓ "What is Donald Trump's current political activity?" → "Trump political activity"
135Return ONLY the extremely brief search query.
136"""
138 # Get response from LLM
139 response = self.llm.invoke(prompt)
140 optimized_query = remove_think_tags(
141 str(response.content)
142 if hasattr(response, "content")
143 else str(response)
144 ).strip()
146 # Clean up the query - remove any explanations
147 lines = optimized_query.split("\n")
148 for line in lines:
149 line = line.strip()
150 if line and not line.lower().startswith(
151 ("here", "i would", "the best", "this query")
152 ):
153 optimized_query = line
154 break
156 # Remove any quotes that wrap the entire query
157 if (
158 optimized_query.startswith('"')
159 and optimized_query.endswith('"')
160 and optimized_query.count('"') == 2
161 ):
162 optimized_query = optimized_query[1:-1]
164 logger.info(f"Original query: '{query}'")
165 logger.info(f"Optimized for Guardian: '{optimized_query}'")
167 return optimized_query
169 except Exception:
170 logger.exception("Error optimizing query")
171 return query # Fall back to original query on error
173 def _adapt_dates_for_query_type(self, query: str) -> None:
174 """
175 Adapt date range based on query type (historical vs current).
177 Args:
178 query: The search query
179 """
180 # Fast path - for very short queries, default to recent news
181 if len(query.split()) <= 4:
182 logger.info("Short query detected, defaulting to recent news")
183 # Default to 60 days for short queries
184 recent = (datetime.now(UTC) - timedelta(days=60)).strftime(
185 "%Y-%m-%d"
186 )
187 self.from_date = recent
188 self.order_by = "newest"
189 return
191 if not self.llm or not self.adaptive_search:
192 return
194 try:
195 prompt = f"""Is this query asking about HISTORICAL events or CURRENT events?
197Query: "{query}"
199ONE WORD ANSWER ONLY:
200- "HISTORICAL" if about past events (older than 1 year)
201- "CURRENT" if about recent events (within past year)
202- "UNCLEAR" if can't determine
204ONE WORD ONLY:"""
206 response = self.llm.invoke(prompt)
207 answer = (
208 remove_think_tags(
209 str(response.content)
210 if hasattr(response, "content")
211 else str(response)
212 )
213 .strip()
214 .upper()
215 )
217 # Reset to original parameters first
218 self.from_date = self._original_date_params["from_date"]
219 self.to_date = self._original_date_params["to_date"]
221 if "HISTORICAL" in answer:
222 # For historical queries, go back 10 years
223 logger.info(
224 "Query classified as HISTORICAL - extending search timeframe"
225 )
226 ten_years_ago = (
227 datetime.now(UTC) - timedelta(days=3650)
228 ).strftime("%Y-%m-%d")
229 self.from_date = ten_years_ago
231 elif "CURRENT" in answer:
232 # For current events, focus on recent content
233 logger.info(
234 "Query classified as CURRENT - focusing on recent content"
235 )
236 recent = (datetime.now(UTC) - timedelta(days=60)).strftime(
237 "%Y-%m-%d"
238 )
239 self.from_date = recent
240 self.order_by = "newest" # Prioritize newest for current events
242 except Exception:
243 logger.exception("Error adapting dates for query type")
244 # Keep original date parameters on error
    def _adaptive_search(self, query: str) -> Tuple[List[Dict[str, Any]], str]:
        """
        Perform adaptive search that progressively adjusts parameters based on results.

        Temporarily mutates self.from_date / self.order_by / self.section while
        probing fallback strategies, then restores from_date and order_by (and
        section, within strategy 3) before returning.

        Args:
            query: The search query

        Returns:
            Tuple of (list of articles, search strategy used)
        """
        # Try with current parameters
        articles = self._get_all_data(query)
        strategy = "initial"

        # Fewer than 3 hits counts as "too few"; only then try fallbacks.
        if len(articles) < 3 and self.adaptive_search:
            logger.info(
                f"Initial search found only {len(articles)} results, trying alternative strategies"
            )

            # Save settings so every strategy below can be undone.
            original_from_date = self.from_date
            original_order_by = self.order_by

            # Strategy 1: Expand to 6 months
            logger.info("Strategy 1: Expanding time range to 6 months")
            six_months_ago = (datetime.now(UTC) - timedelta(days=180)).strftime(
                "%Y-%m-%d"
            )
            self.from_date = six_months_ago

            articles1 = self._get_all_data(query)
            # Keep a strategy's results only if it strictly improves the count.
            if len(articles1) > len(articles):
                articles = articles1
                strategy = "expanded_6mo"

            # Strategy 2: Expand to all time and try relevance order
            if len(articles) < 3:
                logger.info(
                    "Strategy 2: Expanding to all time with relevance ordering"
                )
                self.from_date = "2000-01-01"  # Effectively "all time"
                self.order_by = "relevance"

                articles2 = self._get_all_data(query)
                if len(articles2) > len(articles):
                    articles = articles2
                    strategy = "all_time_relevance"

            # Strategy 3: Try removing section constraints (only if one is set)
            if len(articles) < 3 and self.section:
                logger.info("Strategy 3: Removing section constraint")
                original_section = self.section
                self.section = None

                articles3 = self._get_all_data(query)
                if len(articles3) > len(articles):
                    articles = articles3
                    strategy = "no_section"

                # Restore section setting
                self.section = original_section

            # Restore original settings
            self.from_date = original_from_date
            self.order_by = original_order_by

        logger.info(
            f"Adaptive search using strategy '{strategy}' found {len(articles)} results"
        )
        return articles, strategy
    def _get_all_data(self, query: str) -> List[Dict[str, Any]]:
        """
        Get all article data from The Guardian API in a single call.
        Always requests all fields for simplicity.

        Applies rate limiting before the HTTP request. On failure, returns []
        unless the error is (or maps to) a RateLimitError, which is re-raised
        so callers can back off.

        Args:
            query: The search query

        Returns:
            List of articles with all data (possibly empty on error)
        """
        try:
            # Ensure query is not empty
            if not query or query.strip() == "":
                query = "news"
                logger.warning("Empty query provided, using 'news' as default")

            # Ensure query is not too long for API
            if len(query) > 100:
                logger.warning(
                    f"Query too long for Guardian API ({len(query)} chars), truncating"
                )
                query = query[:100]

            # Always request all fields for simplicity
            # Ensure max_results is an integer to avoid comparison errors
            page_size = min(
                int(self.max_results) if self.max_results is not None else 10,
                50,
            )

            # Log full parameters for debugging
            logger.info(f"Guardian API search query: '{query}'")
            logger.info(
                f"Guardian API date range: {self.from_date} to {self.to_date}"
            )

            params = {
                "q": query,
                "api-key": self.api_key,
                "from-date": self.from_date,
                "to-date": self.to_date,
                "order-by": self.order_by,
                "page-size": page_size,  # API maximum is 50
                "show-fields": "headline,trailText,byline,body,publication",
                "show-tags": "keyword",
            }

            # Add section filter if specified
            if self.section:
                params["section"] = self.section

            # Log the complete request parameters (except API key)
            log_params = params.copy()
            log_params["api-key"] = "REDACTED"
            logger.info(f"Guardian API request parameters: {log_params}")

            # Apply rate limiting before request
            self._last_wait_time = self.rate_tracker.apply_rate_limit(
                self.engine_type
            )

            # Execute the API request (safe_get validates the target URL)
            response = safe_get(self.api_url, params=params)
            response.raise_for_status()

            data = response.json()

            # Extract results from the response envelope
            articles = data.get("response", {}).get("results", [])
            logger.info(f"Guardian API returned {len(articles)} articles")

            # Format results to include all data, capped at max_results
            formatted_articles = []
            for i, article in enumerate(articles):
                if i >= self.max_results:
                    break

                fields = article.get("fields", {})

                # Format the article with all fields; prefer the richer
                # "fields" values, falling back to the top-level metadata.
                result = {
                    "id": article.get("id", ""),
                    "title": fields.get(
                        "headline", article.get("webTitle", "")
                    ),
                    "link": article.get("webUrl", ""),
                    "snippet": fields.get("trailText", ""),
                    "publication_date": article.get("webPublicationDate", ""),
                    "section": article.get("sectionName", ""),
                    "author": fields.get("byline", ""),
                    "content": fields.get("body", ""),
                    "full_content": fields.get("body", ""),
                }

                # Extract tags/keywords (only tags of type "keyword")
                tags = article.get("tags", [])
                result["keywords"] = [
                    tag.get("webTitle", "")
                    for tag in tags
                    if tag.get("type") == "keyword"
                ]

                formatted_articles.append(result)

            return formatted_articles

        except RateLimitError:
            # Let rate-limit errors propagate untouched.
            raise
        except Exception as e:
            # Sanitize before logging so the API key never leaks into logs.
            sanitized = self._sanitize_error_message(str(e))
            logger.exception(
                "Error getting data from The Guardian API: {}", sanitized
            )
            # Re-raise as RateLimitError if the failure was rate limiting.
            self._raise_if_rate_limit(e)
            return []
435 def _get_previews(self, query: str) -> List[Dict[str, Any]]:
436 """
437 Get preview information for Guardian articles with enhanced optimization.
439 Args:
440 query: The search query
442 Returns:
443 List of preview dictionaries
444 """
445 logger.info(
446 f"Getting articles from The Guardian API for query: {query}"
447 )
449 # Step 1: Optimize the query using LLM
450 optimized_query = self._optimize_query_for_guardian(query)
452 # Step 2: Adapt date parameters based on query type
453 self._adapt_dates_for_query_type(optimized_query)
455 # Step 3: Perform adaptive search
456 articles, strategy = self._adaptive_search(optimized_query)
458 # Store search metadata for debugging
459 self._search_metadata = {
460 "original_query": query,
461 "optimized_query": optimized_query,
462 "strategy": strategy,
463 "from_date": self.from_date,
464 "to_date": self.to_date,
465 "section": self.section,
466 "order_by": self.order_by,
467 }
469 # Store full articles for later use
470 self._full_articles = {a["id"]: a for a in articles}
472 # Return only preview fields for each article
473 previews = []
474 for article in articles:
475 preview = {
476 "id": article["id"],
477 "title": article["title"],
478 "link": article["link"],
479 "snippet": article["snippet"],
480 "publication_date": article["publication_date"],
481 "section": article["section"],
482 "author": article["author"],
483 "keywords": article.get("keywords", []),
484 }
485 previews.append(preview)
487 return previews
489 def _get_full_content(
490 self, relevant_items: List[Dict[str, Any]]
491 ) -> List[Dict[str, Any]]:
492 """
493 Get full content for the relevant Guardian articles.
494 Restores full content from the cached data.
496 Args:
497 relevant_items: List of relevant preview dictionaries
499 Returns:
500 List of result dictionaries with full content
501 """
502 logger.info(
503 f"Adding full content to {len(relevant_items)} relevant Guardian articles"
504 )
506 # Get full articles for relevant items
507 results = []
508 for item in relevant_items:
509 article_id = item.get("id", "")
511 # Get the full article from our cache
512 if (
513 hasattr(self, "_full_articles")
514 and article_id in self._full_articles
515 ):
516 results.append(self._full_articles[article_id])
517 else:
518 # If not found (shouldn't happen), just use the preview
519 results.append(item)
521 return results
    def run(
        self, query: str, research_context: Dict[str, Any] | None = None
    ) -> List[Dict[str, Any]]:
        """
        Execute a search using The Guardian API with the enhanced approach.

        Tries the query as-is, then a simplified form, then a generic last
        resort before giving up. Optionally filters results for relevance via
        the LLM, then swaps previews for full cached articles.

        Args:
            query: The search query
            research_context: Context from previous research to use.

        Returns:
            List of search results ([] if nothing was found or on error)
        """
        logger.info("---Execute a search using The Guardian (enhanced)---")

        # Additional safety check for None query
        if query is None:
            logger.error("None query passed to Guardian search engine")
            query = "news"

        try:
            # Get previews with our enhanced method
            previews = self._get_previews(query)

            # If no results, retry once with only the longer words (max 3)
            if not previews:
                simple_query = " ".join(
                    [w for w in query.split() if len(w) > 3][:3]
                )
                logger.warning(
                    f"No Guardian articles found, trying simplified query: {simple_query}"
                )
                previews = self._get_previews(simple_query)

                # If still no results, try with a very generic query as last resort
                # NOTE(review): the "trump" special case looks like leftover
                # debugging/demo logic — confirm whether it is intentional.
                if not previews and "trump" in query.lower():
                    logger.warning("Trying last resort query: 'Donald Trump'")
                    previews = self._get_previews("Donald Trump")
                elif not previews:
                    logger.warning("Trying last resort query: 'news'")
                    previews = self._get_previews("news")

            # If still no results after all attempts, return empty list.
            # NOTE(review): this early return skips the date-parameter restore
            # and _full_articles cleanup below — confirm that is acceptable.
            if not previews:
                logger.warning(
                    "No Guardian articles found after multiple attempts"
                )
                return []

            # Filter for relevance if we have an LLM and a filtering cap
            if (
                self.llm
                and hasattr(self, "max_filtered_results")
                and self.max_filtered_results
            ):
                filtered_items = self._filter_for_relevance(previews, query)
                if not filtered_items:
                    # Fall back to unfiltered results if everything was filtered out
                    logger.warning(
                        "All articles filtered out, using unfiltered results"
                    )
                    filtered_items = previews[: self.max_filtered_results]
            else:
                filtered_items = previews

            # Get full content for relevant items
            results = self._get_full_content(filtered_items)

            # Add source information to make it clear these are from The Guardian
            for result in results:
                if "source" not in result:
                    result["source"] = "The Guardian"

            # Clean up the cache after use
            if hasattr(self, "_full_articles"):
                del self._full_articles

            # Restore original date parameters (may have been changed by
            # _adapt_dates_for_query_type / _adaptive_search)
            self.from_date = self._original_date_params["from_date"]
            self.to_date = self._original_date_params["to_date"]

            # Log search metadata if available
            if hasattr(self, "_search_metadata"):
                logger.info(f"Search metadata: {self._search_metadata}")
                del self._search_metadata

            return results

        except RateLimitError:
            # Propagate so callers can back off.
            raise
        except Exception:
            logger.exception("Error in Guardian search")

            # Restore original date parameters on error
            self.from_date = self._original_date_params["from_date"]
            self.to_date = self._original_date_params["to_date"]

            return []
622 def search_by_section(
623 self, section: str, max_results: Optional[int] = None
624 ) -> List[Dict[str, Any]]:
625 """
626 Search for articles in a specific section.
628 Args:
629 section: The Guardian section name (e.g., "politics", "technology")
630 max_results: Maximum number of results (defaults to self.max_results)
632 Returns:
633 List of articles in the section
634 """
635 original_section = self.section
636 original_max_results = self.max_results
638 try:
639 # Set section and max_results for this search
640 self.section = section
641 if max_results: 641 ↛ 642line 641 didn't jump to line 642 because the condition on line 641 was never true
642 self.max_results = max_results
644 # Use empty query to get all articles in the section
645 return self.run("")
647 finally:
648 # Restore original values
649 self.section = original_section
650 self.max_results = original_max_results
652 def get_recent_articles(
653 self, days: int = 7, max_results: Optional[int] = None
654 ) -> List[Dict[str, Any]]:
655 """
656 Get recent articles from The Guardian.
658 Args:
659 days: Number of days to look back
660 max_results: Maximum number of results (defaults to self.max_results)
662 Returns:
663 List of recent articles
664 """
665 original_from_date = self.from_date
666 original_order_by = self.order_by
667 original_max_results = self.max_results
669 try:
670 # Set parameters for this search
671 self.from_date = (
672 datetime.now(UTC) - timedelta(days=days)
673 ).strftime("%Y-%m-%d")
674 self.order_by = "newest"
675 if max_results: 675 ↛ 676line 675 didn't jump to line 676 because the condition on line 675 was never true
676 self.max_results = max_results
678 # Use empty query to get all recent articles
679 return self.run("")
681 finally:
682 # Restore original values
683 self.from_date = original_from_date
684 self.order_by = original_order_by
685 self.max_results = original_max_results