Coverage for src/local_deep_research/advanced_search_system/filters/followup_relevance_filter.py: 11%

51 statements  

coverage.py v7.12.0, created at 2026-01-11 00:51 +0000

"""
Follow-up Relevance Filter

Filters and ranks past research sources based on their relevance
to follow-up questions.
"""

from typing import Dict, List

from loguru import logger

from .base_filter import BaseFilter
from ...utilities.search_utilities import remove_think_tags


class FollowUpRelevanceFilter(BaseFilter):
    """
    Filters past research sources by relevance to follow-up questions.

    This filter analyzes sources from previous research and determines
    which ones are most relevant to the new follow-up question.
    """

    def filter_results(
        self, results: List[Dict], query: str, max_results: int = 10, **kwargs
    ) -> List[Dict]:
        """
        Filter search results by relevance to the follow-up query.

        Args:
            results: List of source dictionaries from past research
            query: The follow-up query
            max_results: Maximum number of results to return (default: 10)
            **kwargs: Additional parameters:
                - past_findings: Summary of past findings for context
                - original_query: The original research query

        Returns:
            Filtered list of relevant sources
        """
        if not results:
            return []

        past_findings = kwargs.get("past_findings", "")
        original_query = kwargs.get("original_query", "")

        # Use LLM to select relevant sources
        relevant_indices = self._select_relevant_sources(
            results, query, past_findings, max_results, original_query
        )

        # Return selected sources
        filtered = [results[i] for i in relevant_indices if i < len(results)]

        logger.info(
            f"Filtered {len(results)} sources to {len(filtered)} relevant ones "
            f"for follow-up query. Kept indices: {relevant_indices}"
        )

        return filtered

    def _select_relevant_sources(
        self,
        sources: List[Dict],
        query: str,
        context: str,
        max_results: int,
        original_query: str = "",
    ) -> List[int]:
        """
        Select relevant sources using LLM.

        Args:
            sources: List of source dictionaries
            query: The follow-up query
            context: Past findings context
            max_results: Maximum number of sources to select
            original_query: The original research query

        Returns:
            List of indices of relevant sources
        """
        if not self.model:
            # If no model available, return first max_results
            return list(range(min(max_results, len(sources))))

        # Build source list for LLM
        source_list = []
        for i, source in enumerate(sources):
            title = source.get("title") or "Unknown"
            url = source.get("url") or ""
            snippet = (
                source.get("snippet") or source.get("content_preview") or ""
            )[:150]
            source_list.append(
                f"{i}. {title}\n URL: {url}\n Content: {snippet}"
            )

        sources_text = "\n\n".join(source_list)

        # Include context if available for better selection
        context_section = ""
        if context or original_query:
            parts = []
            if original_query:
                parts.append(f"Original research question: {original_query}")
            if context:
                parts.append(f"Previous research findings:\n{context}")

            context_section = f"""
Previous Research Context:
{chr(10).join(parts)}

---
"""

        prompt = f"""
Select the most relevant sources for answering this follow-up question based on the previous research context.
{context_section}
Follow-up question: "{query}"

Available sources from previous research:
{sources_text}

Instructions:
- Select sources that are most relevant to the follow-up question given the context
- Consider which sources directly address the question or provide essential information
- Think about what the user is asking for in relation to the previous findings
- Return ONLY a JSON array of source numbers (e.g., [0, 2, 5, 7])
- Do not include any explanation or other text

Return the indices of relevant sources as a JSON array:"""

        try:
            response = self.model.invoke(prompt)
            content = remove_think_tags(response.content).strip()

            # Parse JSON response
            import json

            try:
                indices = json.loads(content)
                # Validate it's a list of integers
                if not isinstance(indices, list):
                    raise ValueError("Response is not a list")
                indices = [
                    int(i)
                    for i in indices
                    if isinstance(i, (int, float)) and int(i) < len(sources)
                ]

            except (json.JSONDecodeError, ValueError) as parse_error:
                logger.debug(
                    f"Failed to parse JSON, attempting regex fallback: {parse_error}"
                )
                # Fallback to regex extraction
                import re

                numbers = re.findall(r"\d+", content)
                indices = [int(n) for n in numbers if int(n) < len(sources)]

            return indices
        except Exception as e:
            logger.debug(f"LLM source selection failed: {e}")
            # Fallback to first max_results sources
            return list(range(min(max_results, len(sources))))
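A minimal usage sketch of the filter, separate from the module above. It assumes the package is importable as local_deep_research with the path shown in the report header, that BaseFilter accepts the LLM as its first constructor argument, and that the model is a LangChain-style chat model whose invoke() returns an object with a .content string; ChatOpenAI is only a hypothetical model choice, and the constructor signature should be checked against base_filter.py.

# Usage sketch -- constructor signature and model wiring are assumptions,
# not taken from the module above.
from langchain_openai import ChatOpenAI  # hypothetical model choice

from local_deep_research.advanced_search_system.filters.followup_relevance_filter import (
    FollowUpRelevanceFilter,
)

# Any chat model exposing .invoke() returning an object with .content works.
model = ChatOpenAI(model="gpt-4o-mini")
relevance_filter = FollowUpRelevanceFilter(model)  # assumed BaseFilter signature

past_sources = [
    {"title": "Source A", "url": "https://example.org/a", "snippet": "Accuracy results..."},
    {"title": "Source B", "url": "https://example.org/b", "snippet": "Latency benchmarks..."},
]

relevant = relevance_filter.filter_results(
    past_sources,
    query="How do the approaches compare on latency?",
    max_results=5,
    past_findings="Approach A outperformed B on accuracy.",
    original_query="Compare approaches A and B",
)
print([s["title"] for s in relevant])

If no model is configured (self.model is falsy) or the LLM call fails, the filter falls back to returning the first max_results sources unchanged, so the call above degrades gracefully rather than raising.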