Coverage for src / local_deep_research / advanced_search_system / filters / followup_relevance_filter.py: 11%
51 statements
« prev ^ index » next coverage.py v7.12.0, created at 2026-01-11 00:51 +0000
« prev ^ index » next coverage.py v7.12.0, created at 2026-01-11 00:51 +0000
1"""
2Follow-up Relevance Filter
4Filters and ranks past research sources based on their relevance
5to follow-up questions.
6"""
8from typing import Dict, List
9from loguru import logger
11from .base_filter import BaseFilter
12from ...utilities.search_utilities import remove_think_tags
class FollowUpRelevanceFilter(BaseFilter):
    """
    Filters past research sources by relevance to follow-up questions.

    This filter analyzes sources from previous research and determines
    which ones are most relevant to the new follow-up question.
    """

    def filter_results(
        self, results: List[Dict], query: str, max_results: int = 10, **kwargs
    ) -> List[Dict]:
        """
        Filter search results by relevance to the follow-up query.

        Args:
            results: List of source dictionaries from past research
            query: The follow-up query
            max_results: Maximum number of results to return (default: 10)
            **kwargs: Additional parameters:
                - past_findings: Summary of past findings for context
                - original_query: The original research query

        Returns:
            Filtered list of relevant sources, in the order the selection
            step returned them. Empty when ``results`` is empty.
        """
        if not results:
            return []

        past_findings = kwargs.get("past_findings", "")
        original_query = kwargs.get("original_query", "")

        # Use LLM to select relevant sources
        relevant_indices = self._select_relevant_sources(
            results, query, past_findings, max_results, original_query
        )

        # Bounds-check on both sides: a negative index would otherwise wrap
        # around to the end of the list or raise IndexError.
        total = len(results)
        filtered = [results[i] for i in relevant_indices if 0 <= i < total]

        logger.info(
            f"Filtered {len(results)} sources to {len(filtered)} relevant ones "
            f"for follow-up query. Kept indices: {relevant_indices}"
        )

        return filtered

    @staticmethod
    def _normalize_indices(
        candidates: List[int], source_count: int, max_results: int
    ) -> List[int]:
        """
        Reduce raw candidate indices to a safe selection.

        Drops indices outside ``[0, source_count)``, removes duplicates while
        keeping first-seen order, and keeps at most ``max_results`` entries
        (a negative ``max_results`` is treated as zero).
        """
        limit = max(max_results, 0)
        seen = set()
        cleaned: List[int] = []
        for index in candidates:
            if len(cleaned) >= limit:
                break
            if 0 <= index < source_count and index not in seen:
                seen.add(index)
                cleaned.append(index)
        return cleaned

    def _select_relevant_sources(
        self,
        sources: List[Dict],
        query: str,
        context: str,
        max_results: int,
        original_query: str = "",
    ) -> List[int]:
        """
        Select relevant sources using LLM.

        Args:
            sources: List of source dictionaries
            query: The follow-up query
            context: Past findings context
            max_results: Maximum number of sources to select
            original_query: The original research query

        Returns:
            List of unique, in-range indices of relevant sources, at most
            ``max_results`` long. When no model is set, or when the model
            call fails, the first ``max_results`` indices are returned.
        """
        import json
        import re

        # self.model is read here but not set in this class; presumably it
        # is provided by BaseFilter -- verify against the base class.
        if not self.model:
            # If no model available, return first max_results
            return list(range(min(max_results, len(sources))))

        # Build source list for LLM
        source_list = []
        for i, source in enumerate(sources):
            title = source.get("title") or "Unknown"
            url = source.get("url") or ""
            # Only the first 150 characters of the snippet go in the prompt.
            snippet = (
                source.get("snippet") or source.get("content_preview") or ""
            )[:150]
            source_list.append(
                f"{i}. {title}\n URL: {url}\n Content: {snippet}"
            )

        sources_text = "\n\n".join(source_list)

        # Include context if available for better selection
        context_section = ""
        if context or original_query:
            parts = []
            if original_query:
                parts.append(f"Original research question: {original_query}")
            if context:
                parts.append(f"Previous research findings:\n{context}")

            joined_parts = "\n".join(parts)
            context_section = f"""
Previous Research Context:
{joined_parts}

---
"""

        prompt = f"""
Select the most relevant sources for answering this follow-up question based on the previous research context.
{context_section}
Follow-up question: "{query}"

Available sources from previous research:
{sources_text}

Instructions:
- Select sources that are most relevant to the follow-up question given the context
- Consider which sources directly address the question or provide essential information
- Think about what the user is asking for in relation to the previous findings
- Return ONLY a JSON array of source numbers (e.g., [0, 2, 5, 7])
- Do not include any explanation or other text

Return the indices of relevant sources as a JSON array:"""

        try:
            response = self.model.invoke(prompt)
            content = remove_think_tags(response.content).strip()

            try:
                # Preferred path: the reply is a bare JSON array.
                parsed = json.loads(content)
                if not isinstance(parsed, list):
                    raise ValueError("Response is not a list")
                # bool is a subclass of int; JSON true/false are not indices.
                candidates = [
                    int(value)
                    for value in parsed
                    if isinstance(value, (int, float))
                    and not isinstance(value, bool)
                ]
            except (json.JSONDecodeError, ValueError) as parse_error:
                logger.debug(
                    f"Failed to parse JSON, attempting regex fallback: {parse_error}"
                )
                # Fallback to regex extraction of every digit run in the reply
                candidates = [int(n) for n in re.findall(r"\d+", content)]

            return self._normalize_indices(
                candidates, len(sources), max_results
            )

        except Exception as e:
            # Deliberate best-effort: any failure in the model call or in
            # post-processing degrades to the naive selection, not an error.
            logger.debug(f"LLM source selection failed: {e}")
            # Fallback to first max_results sources
            return list(range(min(max_results, len(sources))))