Coverage for src/local_deep_research/advanced_search_system/filters/followup_relevance_filter.py: 11%

51 statements  

coverage.py v7.12.0, created at 2026-01-11 00:51 +0000

"""
Follow-up Relevance Filter

Filters and ranks past research sources based on their relevance
to follow-up questions.
"""

from typing import Dict, List

from loguru import logger

from .base_filter import BaseFilter
from ...utilities.search_utilities import remove_think_tags


class FollowUpRelevanceFilter(BaseFilter):
    """
    Filters past research sources by relevance to follow-up questions.

    This filter analyzes sources from previous research and determines
    which ones are most relevant to the new follow-up question.
    """

    def filter_results(
        self, results: List[Dict], query: str, max_results: int = 10, **kwargs
    ) -> List[Dict]:
        """
        Filter search results by relevance to the follow-up query.

        Args:
            results: List of source dictionaries from past research
            query: The follow-up query
            max_results: Maximum number of results to return (default: 10)
            **kwargs: Additional parameters:
                - past_findings: Summary of past findings for context
                - original_query: The original research query

        Returns:
            Filtered list of relevant sources
        """
        if not results:
            return []

        past_findings = kwargs.get("past_findings", "")
        original_query = kwargs.get("original_query", "")

        # Use LLM to select relevant sources
        relevant_indices = self._select_relevant_sources(
            results, query, past_findings, max_results, original_query
        )

        # Return selected sources
        filtered = [results[i] for i in relevant_indices if i < len(results)]

        logger.info(
            f"Filtered {len(results)} sources to {len(filtered)} relevant ones "
            f"for follow-up query. Kept indices: {relevant_indices}"
        )

        return filtered

    def _select_relevant_sources(
        self,
        sources: List[Dict],
        query: str,
        context: str,
        max_results: int,
        original_query: str = "",
    ) -> List[int]:
        """
        Select relevant sources using LLM.

        Args:
            sources: List of source dictionaries
            query: The follow-up query
            context: Past findings context
            max_results: Maximum number of sources to select
            original_query: The original research query

        Returns:
            List of indices of relevant sources
        """
        if not self.model:
            # If no model available, return first max_results
            return list(range(min(max_results, len(sources))))

        # Build source list for LLM
        source_list = []
        for i, source in enumerate(sources):
            title = source.get("title") or "Unknown"
            url = source.get("url") or ""
            snippet = (
                source.get("snippet") or source.get("content_preview") or ""
            )[:150]
            source_list.append(
                f"{i}. {title}\n URL: {url}\n Content: {snippet}"
            )

        sources_text = "\n\n".join(source_list)

        # Include context if available for better selection
        context_section = ""
        if context or original_query:
            parts = []
            if original_query:
                parts.append(f"Original research question: {original_query}")
            if context:
                parts.append(f"Previous research findings:\n{context}")

            context_section = f"""
Previous Research Context:
{chr(10).join(parts)}

---
"""

        prompt = f"""
Select the most relevant sources for answering this follow-up question based on the previous research context.
{context_section}
Follow-up question: "{query}"

Available sources from previous research:
{sources_text}

Instructions:
- Select sources that are most relevant to the follow-up question given the context
- Consider which sources directly address the question or provide essential information
- Think about what the user is asking for in relation to the previous findings
- Return ONLY a JSON array of source numbers (e.g., [0, 2, 5, 7])
- Do not include any explanation or other text

Return the indices of relevant sources as a JSON array:"""

        try:
            response = self.model.invoke(prompt)
            content = remove_think_tags(response.content).strip()

            # Parse JSON response
            import json

            try:
                indices = json.loads(content)
                # Validate it's a list of integers
                if not isinstance(indices, list):
                    raise ValueError("Response is not a list")
                indices = [
                    int(i)
                    for i in indices
                    if isinstance(i, (int, float)) and int(i) < len(sources)
                ]

            except (json.JSONDecodeError, ValueError) as parse_error:
                logger.debug(
                    f"Failed to parse JSON, attempting regex fallback: {parse_error}"
                )
                # Fallback to regex extraction
                import re

                numbers = re.findall(r"\d+", content)
                indices = [int(n) for n in numbers if int(n) < len(sources)]

            return indices
        except Exception as e:
            logger.debug(f"LLM source selection failed: {e}")
            # Fallback to first max_results sources
            return list(range(min(max_results, len(sources))))
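A minimal usage sketch of the filter, separate from the module above. It assumes the package is importable as local_deep_research with the path shown in the report header, that BaseFilter accepts the LLM as its first constructor argument, and that the model is a LangChain-style chat model whose invoke() returns an object with a .content string; ChatOpenAI is only a hypothetical model choice, and the constructor signature should be checked against base_filter.py.

# Usage sketch -- constructor signature and model wiring are assumptions,
# not taken from the module above.
from langchain_openai import ChatOpenAI  # hypothetical model choice

from local_deep_research.advanced_search_system.filters.followup_relevance_filter import (
    FollowUpRelevanceFilter,
)

# Any chat model exposing .invoke() returning an object with .content works.
model = ChatOpenAI(model="gpt-4o-mini")
relevance_filter = FollowUpRelevanceFilter(model)  # assumed BaseFilter signature

past_sources = [
    {"title": "Source A", "url": "https://example.org/a", "snippet": "Accuracy results..."},
    {"title": "Source B", "url": "https://example.org/b", "snippet": "Latency benchmarks..."},
]

relevant = relevance_filter.filter_results(
    past_sources,
    query="How do the approaches compare on latency?",
    max_results=5,
    past_findings="Approach A outperformed B on accuracy.",
    original_query="Compare approaches A and B",
)
print([s["title"] for s in relevant])

If no model is configured (self.model is falsy) or the LLM call fails, the filter falls back to returning the first max_results sources unchanged, so the call above degrades gracefully rather than raising.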