Coverage for src/local_deep_research/web_search_engines/engines/search_engine_guardian.py: 44%

243 statements  

coverage.py v7.12.0, created at 2026-01-11 00:51 +0000

1 from datetime import datetime, timedelta, UTC

2 from typing import Any, Dict, List, Optional, Tuple

3

4 from langchain_core.language_models import BaseLLM

5 from loguru import logger

6

7 from ...config import search_config

8 from ...config.search_config import get_setting_from_snapshot

9 from ...utilities.search_utilities import remove_think_tags

10 from ...security.safe_requests import safe_get

11 from ..search_engine_base import BaseSearchEngine

12

13

14 class GuardianSearchEngine(BaseSearchEngine):

15 """Enhanced Guardian API search engine implementation with LLM query optimization""" 

16 

17 # Mark as public search engine 

18 is_public = True 

19 

20 def __init__( 

21 self, 

22 max_results: int = 10, 

23 api_key: Optional[str] = None, 

24 from_date: Optional[str] = None, 

25 to_date: Optional[str] = None, 

26 section: Optional[str] = None, 

27 order_by: str = "relevance", 

28 llm: Optional[BaseLLM] = None, 

29 max_filtered_results: Optional[int] = None, 

30 optimize_queries: bool = True, 

31 adaptive_search: bool = True, 

32 **kwargs, 

33 ): 

34 """ 

35 Initialize The Guardian search engine with enhanced features. 

36 

37 Args: 

38 max_results: Maximum number of search results 

39 api_key: The Guardian API key (can also be set in GUARDIAN_API_KEY env) 

40 from_date: Start date for search (YYYY-MM-DD format, default 1 month ago) 

41 to_date: End date for search (YYYY-MM-DD format, default today) 

42 section: Filter by section (e.g., "politics", "technology", "sport") 

43 order_by: Sort order ("relevance", "newest", "oldest") 

44 llm: Language model for relevance filtering and query optimization 

45 max_filtered_results: Maximum number of results to keep after filtering 

46 optimize_queries: Whether to optimize queries using LLM 

47 adaptive_search: Whether to use adaptive search (adjusting date ranges) 

48 """ 

49 # Initialize the BaseSearchEngine with LLM, max_filtered_results, and max_results 

50 super().__init__( 

51 llm=llm, 

52 max_filtered_results=max_filtered_results, 

53 max_results=max_results, 

54 ) 

55 # Get API key - check params or database 

56 guardian_api_key = api_key 

57 if not guardian_api_key: 

58 guardian_api_key = get_setting_from_snapshot( 

59 "search.engine.web.guardian.api_key", 

60 settings_snapshot=kwargs.get("settings_snapshot"), 

61 ) 

62 self.api_key = guardian_api_key 

63 self.optimize_queries = optimize_queries 

64 self.adaptive_search = adaptive_search 

65 

66 if not self.api_key: 

67 raise ValueError( 

68 "Guardian API key not found. Please provide api_key parameter or set it in the UI settings." 

69 ) 

70 

71 # Set date ranges if not provided 

72 if not from_date: 

73 # Default to one month ago 

74 one_month_ago = datetime.now(UTC) - timedelta(days=30) 

75 self.from_date = one_month_ago.strftime("%Y-%m-%d") 

76 else: 

77 self.from_date = from_date 

78 

79 if not to_date: 

80 # Default to today 

81 self.to_date = datetime.now(UTC).strftime("%Y-%m-%d") 

82 else: 

83 self.to_date = to_date 

84 

85 self.section = section 

86 self.order_by = order_by 

87 self._original_date_params = { 

88 "from_date": self.from_date, 

89 "to_date": self.to_date, 

90 } 

91 

92 # API base URL 

93 self.api_url = "https://content.guardianapis.com/search" 

94 
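The constructor resolves the API key (explicit parameter first, then the settings snapshot), defaults the date window to the past month, and stashes the original dates so later methods can restore them. A minimal construction sketch; the key value is hypothetical and the import path is inferred from the file path in the header above:

    from local_deep_research.web_search_engines.engines.search_engine_guardian import (
        GuardianSearchEngine,
    )

    engine = GuardianSearchEngine(
        api_key="YOUR_GUARDIAN_API_KEY",  # hypothetical; otherwise read from UI settings
        max_results=5,
        section="technology",
        order_by="newest",
        llm=None,  # without an LLM, query optimization and date adaptation are skipped
    )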

95 def _optimize_query_for_guardian(self, query: str) -> str: 

96 """ 

97 Optimize a natural language query for Guardian search. 

98 Uses LLM to transform questions into effective news search queries. 

99 

100 Args: 

101 query: Natural language query 

102 

103 Returns: 

104 Optimized query string for Guardian 

105 """ 

106 # Handle extremely long queries by truncating first 

107 if len(query) > 150: 

108 simple_query = " ".join(query.split()[:10]) 

109 logger.info( 

110 f"Query too long ({len(query)} chars), truncating to: {simple_query}" 

111 ) 

112 query = simple_query 

113 

114 if not self.llm or not self.optimize_queries:  # coverage: 114 ↛ 118 (condition was always true)

115 # Return original query if no LLM available or optimization disabled 

116 return query 

117 

118 try: 

119 # Prompt for query optimization 

120 prompt = f"""Transform this natural language question into a very short Guardian news search query. 

121 

122 Original query: "{query}"

123

124 CRITICAL RULES:

125 1. ONLY RETURN THE EXACT SEARCH QUERY - NO EXPLANATIONS, NO COMMENTS

126 2. Keep it EXTREMELY BRIEF - MAXIMUM 3-4 words total

127 3. Focus only on the main topic/person/event

128 4. Include proper names when relevant

129 5. Remove ALL unnecessary words

130 6. DO NOT use Boolean operators (no AND/OR)

131 7. DO NOT use quotes

132

133 EXAMPLE CONVERSIONS:

134 ✓ "What's the impact of rising interest rates on UK housing market?" → "UK housing rates"

135 ✓ "Latest developments in the Ukraine-Russia peace negotiations" → "Ukraine Russia negotiations"

136 ✓ "How are tech companies responding to AI regulation?" → "tech AI regulation"

137 ✓ "What is Donald Trump's current political activity?" → "Trump political activity"

138

139 Return ONLY the extremely brief search query.

140 """

141 

142 # Get response from LLM 

143 response = self.llm.invoke(prompt) 

144 optimized_query = remove_think_tags(response.content).strip() 

145 

146 # Clean up the query - remove any explanations 

147 lines = optimized_query.split("\n") 

148 for line in lines: 

149 line = line.strip() 

150 if line and not line.lower().startswith( 

151 ("here", "i would", "the best", "this query") 

152 ): 

153 optimized_query = line 

154 break 

155 

156 # Remove any quotes that wrap the entire query 

157 if ( 

158 optimized_query.startswith('"') 

159 and optimized_query.endswith('"') 

160 and optimized_query.count('"') == 2 

161 ): 

162 optimized_query = optimized_query[1:-1] 

163 

164 logger.info(f"Original query: '{query}'") 

165 logger.info(f"Optimized for Guardian: '{optimized_query}'") 

166 

167 return optimized_query 

168 

169 except Exception: 

170 logger.exception("Error optimizing query") 

171 return query # Fall back to original query on error 

172 
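The post-processing above keeps the first response line that does not read like commentary, then strips one pair of wrapping quotes. A standalone sketch of the same cleanup (clean_llm_query is a hypothetical helper name, not part of the module):

    def clean_llm_query(raw: str) -> str:
        """Mirror of the cleanup above: first non-commentary line, unwrapped."""
        query = raw.strip()
        for line in query.split("\n"):
            line = line.strip()
            if line and not line.lower().startswith(
                ("here", "i would", "the best", "this query")
            ):
                query = line
                break
        if query.startswith('"') and query.endswith('"') and query.count('"') == 2:
            query = query[1:-1]
        return query

    assert clean_llm_query('Here is the query:\n"UK housing rates"') == "UK housing rates"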

173 def _adapt_dates_for_query_type(self, query: str) -> None: 

174 """ 

175 Adapt date range based on query type (historical vs current). 

176 

177 Args: 

178 query: The search query 

179 """ 

180 # Fast path - for very short queries, default to recent news 

181 if len(query.split()) <= 4: 

182 logger.info("Short query detected, defaulting to recent news") 

183 # Default to 60 days for short queries 

184 recent = (datetime.now(UTC) - timedelta(days=60)).strftime( 

185 "%Y-%m-%d" 

186 ) 

187 self.from_date = recent 

188 self.order_by = "newest" 

189 return 

190 

191 if not self.llm or not self.adaptive_search:  # coverage: 191 ↛ 194 (condition was always true)

192 return 

193 

194 try: 

195 prompt = f"""Is this query asking about HISTORICAL events or CURRENT events? 

196 

197 Query: "{query}"

198

199 ONE WORD ANSWER ONLY:

200 - "HISTORICAL" if about past events (older than 1 year)

201 - "CURRENT" if about recent events (within past year)

202 - "UNCLEAR" if can't determine

203

204 ONE WORD ONLY:"""

205 

206 response = self.llm.invoke(prompt) 

207 answer = remove_think_tags(response.content).strip().upper() 

208 

209 # Reset to original parameters first 

210 self.from_date = self._original_date_params["from_date"] 

211 self.to_date = self._original_date_params["to_date"] 

212 

213 if "HISTORICAL" in answer: 

214 # For historical queries, go back 10 years 

215 logger.info( 

216 "Query classified as HISTORICAL - extending search timeframe" 

217 ) 

218 ten_years_ago = ( 

219 datetime.now(UTC) - timedelta(days=3650) 

220 ).strftime("%Y-%m-%d") 

221 self.from_date = ten_years_ago 

222 

223 elif "CURRENT" in answer: 

224 # For current events, focus on recent content 

225 logger.info( 

226 "Query classified as CURRENT - focusing on recent content" 

227 ) 

228 recent = (datetime.now(UTC) - timedelta(days=60)).strftime( 

229 "%Y-%m-%d" 

230 ) 

231 self.from_date = recent 

232 self.order_by = "newest" # Prioritize newest for current events 

233 

234 except Exception: 

235 logger.exception("Error adapting dates for query type") 

236 # Keep original date parameters on error 

237 
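For reference, the date windows this method can settle on, computed the same way as the code above (a sketch only):

    from datetime import datetime, timedelta, UTC

    now = datetime.now(UTC)
    current_or_short = (now - timedelta(days=60)).strftime("%Y-%m-%d")    # CURRENT / short query
    historical = (now - timedelta(days=3650)).strftime("%Y-%m-%d")        # HISTORICAL, ~10 years back
    constructor_default = (now - timedelta(days=30)).strftime("%Y-%m-%d")  # one month ago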

238 def _adaptive_search(self, query: str) -> Tuple[List[Dict[str, Any]], str]: 

239 """ 

240 Perform adaptive search that progressively adjusts parameters based on results. 

241 

242 Args: 

243 query: The search query 

244 

245 Returns: 

246 Tuple of (list of articles, search strategy used) 

247 """ 

248 # Try with current parameters 

249 articles = self._get_all_data(query) 

250 strategy = "initial" 

251 

252 # If no results or too few, try different strategies 

253 if len(articles) < 3 and self.adaptive_search:  # coverage: 253 ↛ 254 (condition was never true)

254 logger.info( 

255 f"Initial search found only {len(articles)} results, trying alternative strategies" 

256 ) 

257 

258 # Try with expanded date range 

259 original_from_date = self.from_date 

260 original_order_by = self.order_by 

261 

262 # Strategy 1: Expand to 6 months 

263 logger.info("Strategy 1: Expanding time range to 6 months") 

264 six_months_ago = (datetime.now(UTC) - timedelta(days=180)).strftime( 

265 "%Y-%m-%d" 

266 ) 

267 self.from_date = six_months_ago 

268 

269 articles1 = self._get_all_data(query) 

270 if len(articles1) > len(articles): 

271 articles = articles1 

272 strategy = "expanded_6mo" 

273 

274 # Strategy 2: Expand to all time and try relevance order 

275 if len(articles) < 3: 

276 logger.info( 

277 "Strategy 2: Expanding to all time with relevance ordering" 

278 ) 

279 self.from_date = "2000-01-01" # Effectively "all time" 

280 self.order_by = "relevance" 

281 

282 articles2 = self._get_all_data(query) 

283 if len(articles2) > len(articles): 

284 articles = articles2 

285 strategy = "all_time_relevance" 

286 

287 # Strategy 3: Try removing section constraints 

288 if len(articles) < 3 and self.section: 

289 logger.info("Strategy 3: Removing section constraint") 

290 original_section = self.section 

291 self.section = None 

292 

293 articles3 = self._get_all_data(query) 

294 if len(articles3) > len(articles): 

295 articles = articles3 

296 strategy = "no_section" 

297 

298 # Restore section setting 

299 self.section = original_section 

300 

301 # Restore original settings 

302 self.from_date = original_from_date 

303 self.order_by = original_order_by 

304 

305 logger.info( 

306 f"Adaptive search using strategy '{strategy}' found {len(articles)} results" 

307 ) 

308 return articles, strategy 

309 
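The escalation above is a fixed ladder: keep the current parameters, then widen to six months, then to all time with relevance ordering, then drop the section filter, keeping whichever attempt returned the most articles. An approximate sketch of the same idea (adaptive and search are hypothetical names):

    from datetime import datetime, timedelta, UTC

    six_months_ago = (datetime.now(UTC) - timedelta(days=180)).strftime("%Y-%m-%d")
    LADDER = [
        ("expanded_6mo", {"from_date": six_months_ago}),
        ("all_time_relevance", {"from_date": "2000-01-01", "order_by": "relevance"}),
        ("no_section", {"section": None}),
    ]

    def adaptive(search, base):
        """Try each rung while results stay thin; keep the best result set."""
        best, strategy = search(base), "initial"
        for name, override in LADDER:
            if len(best) >= 3:
                break
            candidate = search({**base, **override})
            if len(candidate) > len(best):
                best, strategy = candidate, name
        return best, strategy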

310 def _get_all_data(self, query: str) -> List[Dict[str, Any]]: 

311 """ 

312 Get all article data from The Guardian API in a single call. 

313 Always requests all fields for simplicity. 

314 

315 Args: 

316 query: The search query 

317 

318 Returns: 

319 List of articles with all data 

320 """ 

321 try: 

322 # Ensure query is not empty 

323 if not query or query.strip() == "":  # coverage: 323 ↛ 324 (condition was never true)

324 query = "news" 

325 logger.warning("Empty query provided, using 'news' as default") 

326 

327 # Ensure query is not too long for API 

328 if len(query) > 100:  # coverage: 328 ↛ 329 (condition was never true)

329 logger.warning( 

330 f"Query too long for Guardian API ({len(query)} chars), truncating" 

331 ) 

332 query = query[:100] 

333 

334 # Always request all fields for simplicity 

335 # Ensure max_results is an integer to avoid comparison errors 

336 page_size = min( 

337 int(self.max_results) if self.max_results is not None else 10, 

338 50, 

339 ) 

340 

341 # Log full parameters for debugging 

342 logger.info(f"Guardian API search query: '{query}'") 

343 logger.info( 

344 f"Guardian API date range: {self.from_date} to {self.to_date}" 

345 ) 

346 

347 params = { 

348 "q": query, 

349 "api-key": self.api_key, 

350 "from-date": self.from_date, 

351 "to-date": self.to_date, 

352 "order-by": self.order_by, 

353 "page-size": page_size, # API maximum is 50 

354 "show-fields": "headline,trailText,byline,body,publication", 

355 "show-tags": "keyword", 

356 } 

357 

358 # Add section filter if specified 

359 if self.section:  # coverage: 359 ↛ 360 (condition was never true)

360 params["section"] = self.section 

361 

362 # Log the complete request parameters (except API key) 

363 log_params = params.copy() 

364 log_params["api-key"] = "REDACTED" 

365 logger.info(f"Guardian API request parameters: {log_params}") 

366 

367 # Apply rate limiting before request 

368 self._last_wait_time = self.rate_tracker.apply_rate_limit( 

369 self.engine_type 

370 ) 

371 

372 # Execute the API request 

373 response = safe_get(self.api_url, params=params) 

374 response.raise_for_status() 

375 

376 data = response.json() 

377 

378 # Extract results from the response 

379 articles = data.get("response", {}).get("results", []) 

380 logger.info(f"Guardian API returned {len(articles)} articles") 

381 

382 # Format results to include all data 

383 formatted_articles = [] 

384 for i, article in enumerate(articles): 

385 if i >= self.max_results:  # coverage: 385 ↛ 386 (condition was never true)

386 break 

387 

388 fields = article.get("fields", {}) 

389 

390 # Format the article with all fields 

391 result = { 

392 "id": article.get("id", ""), 

393 "title": fields.get( 

394 "headline", article.get("webTitle", "") 

395 ), 

396 "link": article.get("webUrl", ""), 

397 "snippet": fields.get("trailText", ""), 

398 "publication_date": article.get("webPublicationDate", ""), 

399 "section": article.get("sectionName", ""), 

400 "author": fields.get("byline", ""), 

401 "content": fields.get("body", ""), 

402 "full_content": fields.get("body", ""), 

403 } 

404 

405 # Extract tags/keywords 

406 tags = article.get("tags", []) 

407 result["keywords"] = [ 

408 tag.get("webTitle", "") 

409 for tag in tags 

410 if tag.get("type") == "keyword" 

411 ] 

412 

413 formatted_articles.append(result) 

414 

415 return formatted_articles 

416 

417 except Exception: 

418 logger.exception("Error getting data from The Guardian API") 

419 return [] 

420 
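For reference, the call above amounts to a single GET against the public Content API. A minimal reproduction with plain requests (hypothetical key and dates, and without the module's rate limiting and safe_get protections):

    import requests

    resp = requests.get(
        "https://content.guardianapis.com/search",
        params={
            "q": "UK housing rates",
            "api-key": "YOUR_GUARDIAN_API_KEY",  # hypothetical
            "from-date": "2025-12-12",
            "to-date": "2026-01-11",
            "order-by": "relevance",
            "page-size": 10,  # the module caps this at the API maximum of 50
            "show-fields": "headline,trailText,byline,body,publication",
            "show-tags": "keyword",
        },
        timeout=30,
    )
    resp.raise_for_status()
    articles = resp.json().get("response", {}).get("results", [])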

421 def _get_previews(self, query: str) -> List[Dict[str, Any]]: 

422 """ 

423 Get preview information for Guardian articles with enhanced optimization. 

424 

425 Args: 

426 query: The search query 

427 

428 Returns: 

429 List of preview dictionaries 

430 """ 

431 logger.info( 

432 f"Getting articles from The Guardian API for query: {query}" 

433 ) 

434 

435 # Step 1: Optimize the query using LLM 

436 optimized_query = self._optimize_query_for_guardian(query) 

437 

438 # Step 2: Adapt date parameters based on query type 

439 self._adapt_dates_for_query_type(optimized_query) 

440 

441 # Step 3: Perform adaptive search 

442 articles, strategy = self._adaptive_search(optimized_query) 

443 

444 # Store search metadata for debugging 

445 self._search_metadata = { 

446 "original_query": query, 

447 "optimized_query": optimized_query, 

448 "strategy": strategy, 

449 "from_date": self.from_date, 

450 "to_date": self.to_date, 

451 "section": self.section, 

452 "order_by": self.order_by, 

453 } 

454 

455 # Store full articles for later use 

456 self._full_articles = {a["id"]: a for a in articles} 

457 

458 # Return only preview fields for each article 

459 previews = [] 

460 for article in articles:  # coverage: 460 ↛ 461 (loop never started)

461 preview = { 

462 "id": article["id"], 

463 "title": article["title"], 

464 "link": article["link"], 

465 "snippet": article["snippet"], 

466 "publication_date": article["publication_date"], 

467 "section": article["section"], 

468 "author": article["author"], 

469 "keywords": article.get("keywords", []), 

470 } 

471 previews.append(preview) 

472 

473 return previews 

474 
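Each preview deliberately omits the article body, which stays behind in the _full_articles cache keyed by article id until _get_full_content restores it. The preview shape, with field names taken from the code above:

    preview = {
        "id": "...",             # Guardian article id, used as the cache key
        "title": "...",          # headline, falling back to webTitle
        "link": "...",
        "snippet": "...",        # trailText
        "publication_date": "...",
        "section": "...",
        "author": "...",         # byline
        "keywords": ["..."],     # keyword tags
    }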

475 def _get_full_content( 

476 self, relevant_items: List[Dict[str, Any]] 

477 ) -> List[Dict[str, Any]]: 

478 """ 

479 Get full content for the relevant Guardian articles. 

480 Restores full content from the cached data. 

481 

482 Args: 

483 relevant_items: List of relevant preview dictionaries 

484 

485 Returns: 

486 List of result dictionaries with full content 

487 """ 

488 logger.info( 

489 f"Adding full content to {len(relevant_items)} relevant Guardian articles" 

490 ) 

491 

492 # Check if we should add full content 

493 if ( 

494 hasattr(search_config, "SEARCH_SNIPPETS_ONLY") 

495 and search_config.SEARCH_SNIPPETS_ONLY 

496 ): 

497 return relevant_items 

498 

499 # Get full articles for relevant items 

500 results = [] 

501 for item in relevant_items: 

502 article_id = item.get("id", "") 

503 

504 # Get the full article from our cache 

505 if ( 

506 hasattr(self, "_full_articles") 

507 and article_id in self._full_articles 

508 ): 

509 results.append(self._full_articles[article_id]) 

510 else: 

511 # If not found (shouldn't happen), just use the preview 

512 results.append(item) 

513 

514 return results 

515 
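The snippets-only guard means full bodies are attached only when the configuration allows it. A sketch of exercising both paths; this assumes SEARCH_SNIPPETS_ONLY is writable in a test context and that engine was constructed as in the earlier sketch:

    from local_deep_research.config import search_config

    search_config.SEARCH_SNIPPETS_ONLY = True   # previews pass through unchanged
    light = engine.run("tech AI regulation")

    search_config.SEARCH_SNIPPETS_ONLY = False  # bodies restored from _full_articles
    full = engine.run("tech AI regulation")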

516 def run( 

517 self, query: str, research_context: Dict[str, Any] | None = None 

518 ) -> List[Dict[str, Any]]: 

519 """ 

520 Execute a search using The Guardian API with the enhanced approach. 

521 

522 Args: 

523 query: The search query 

524 research_context: Context from previous research to use. 

525 

526 Returns: 

527 List of search results 

528 """ 

529 logger.info("---Execute a search using The Guardian (enhanced)---") 

530 

531 # Additional safety check for None query 

532 if query is None:  # coverage: 532 ↛ 536 (condition was always true)

533 logger.error("None query passed to Guardian search engine") 

534 query = "news" 

535 

536 try: 

537 # Get previews with our enhanced method 

538 previews = self._get_previews(query) 

539 

540 # If no results, try one more time with a simplified query 

541 if not previews:  # coverage: 541 ↛ 559 (condition was always true)

542 simple_query = " ".join( 

543 [w for w in query.split() if len(w) > 3][:3] 

544 ) 

545 logger.warning( 

546 f"No Guardian articles found, trying simplified query: {simple_query}" 

547 ) 

548 previews = self._get_previews(simple_query) 

549 

550 # If still no results, try with a very generic query as last resort 

551 if not previews and "trump" in query.lower():  # coverage: 551 ↛ 552 (condition was never true)

552 logger.warning("Trying last resort query: 'Donald Trump'") 

553 previews = self._get_previews("Donald Trump") 

554 elif not previews:  # coverage: 554 ↛ 559 (condition was always true)

555 logger.warning("Trying last resort query: 'news'") 

556 previews = self._get_previews("news") 

557 

558 # If still no results after all attempts, return empty list 

559 if not previews:  # coverage: 559 ↛ 566 (condition was always true)

560 logger.warning( 

561 "No Guardian articles found after multiple attempts" 

562 ) 

563 return [] 

564 

565 # Filter for relevance if we have an LLM 

566 if ( 

567 self.llm 

568 and hasattr(self, "max_filtered_results") 

569 and self.max_filtered_results 

570 ): 

571 filtered_items = self._filter_for_relevance(previews, query) 

572 if not filtered_items: 

573 # Fall back to unfiltered results if everything was filtered out 

574 logger.warning( 

575 "All articles filtered out, using unfiltered results" 

576 ) 

577 filtered_items = previews[: self.max_filtered_results] 

578 else: 

579 filtered_items = previews 

580 

581 # Get full content for relevant items 

582 results = self._get_full_content(filtered_items) 

583 

584 # Add source information to make it clear these are from The Guardian 

585 for result in results: 

586 if "source" not in result: 

587 result["source"] = "The Guardian" 

588 

589 # Clean up the cache after use 

590 if hasattr(self, "_full_articles"): 

591 del self._full_articles 

592 

593 # Restore original date parameters 

594 self.from_date = self._original_date_params["from_date"] 

595 self.to_date = self._original_date_params["to_date"] 

596 

597 # Log search metadata if available 

598 if hasattr(self, "_search_metadata"): 

599 logger.info(f"Search metadata: {self._search_metadata}") 

600 del self._search_metadata 

601 

602 return results 

603 

604 except Exception: 

605 logger.exception("Error in Guardian search") 

606 

607 # Restore original date parameters on error 

608 self.from_date = self._original_date_params["from_date"] 

609 self.to_date = self._original_date_params["to_date"] 

610 

611 return [] 

612 
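End to end, run chains query optimization, date adaptation, adaptive search, optional relevance filtering, and content restoration, with progressively simpler fallback queries when nothing comes back. A usage sketch (hypothetical query, engine as above):

    results = engine.run(
        "What's the impact of rising interest rates on UK housing market?"
    )
    for r in results:
        print(r["source"], r["publication_date"], r["title"], r["link"])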

613 def search_by_section( 

614 self, section: str, max_results: Optional[int] = None 

615 ) -> List[Dict[str, Any]]: 

616 """ 

617 Search for articles in a specific section. 

618 

619 Args: 

620 section: The Guardian section name (e.g., "politics", "technology") 

621 max_results: Maximum number of results (defaults to self.max_results) 

622 

623 Returns: 

624 List of articles in the section 

625 """ 

626 original_section = self.section 

627 original_max_results = self.max_results 

628 

629 try: 

630 # Set section and max_results for this search 

631 self.section = section 

632 if max_results: 

633 self.max_results = max_results 

634 

635 # Use empty query to get all articles in the section 

636 return self.run("") 

637 

638 finally: 

639 # Restore original values 

640 self.section = original_section 

641 self.max_results = original_max_results 

642 
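A convenience-call sketch; note that the empty query handed to run here is ultimately replaced with "news" by _get_all_data:

    politics = engine.search_by_section("politics", max_results=20)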

643 def get_recent_articles( 

644 self, days: int = 7, max_results: Optional[int] = None 

645 ) -> List[Dict[str, Any]]: 

646 """ 

647 Get recent articles from The Guardian. 

648 

649 Args: 

650 days: Number of days to look back 

651 max_results: Maximum number of results (defaults to self.max_results) 

652 

653 Returns: 

654 List of recent articles 

655 """ 

656 original_from_date = self.from_date 

657 original_order_by = self.order_by 

658 original_max_results = self.max_results 

659 

660 try: 

661 # Set parameters for this search 

662 self.from_date = ( 

663 datetime.now(UTC) - timedelta(days=days) 

664 ).strftime("%Y-%m-%d") 

665 self.order_by = "newest" 

666 if max_results: 

667 self.max_results = max_results 

668 

669 # Use empty query to get all recent articles 

670 return self.run("") 

671 

672 finally: 

673 # Restore original values 

674 self.from_date = original_from_date 

675 self.order_by = original_order_by 

676 self.max_results = original_max_results
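And the recency helper, which temporarily narrows from_date and forces newest-first ordering before the finally block restores the originals:

    this_week = engine.get_recent_articles(days=7, max_results=10)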