Coverage for src / local_deep_research / advanced_search_system / questions / entity_aware_question.py: 98%

48 statements  

« prev     ^ index     » next       coverage.py v7.13.5, created at 2026-04-14 23:55 +0000

1""" 

2Entity-aware question generation for improved entity identification. 

3""" 

4 

5from datetime import datetime, UTC 

6 

7from loguru import logger 

8 

9from .base_question import BaseQuestionGenerator 

10 

11 

class EntityAwareQuestionGenerator(BaseQuestionGenerator):
    """Question generator that creates more targeted searches for entity identification.

    A query is treated as an "entity identification" query when it contains
    one of a fixed set of keywords (case-insensitive substring match). For
    such queries a prompt is sent to ``self.model`` and the reply is parsed
    into a list of questions; for all other queries both public methods
    return an empty list.

    NOTE(review): ``self.model`` is not set in this class; it is presumably
    provided by ``BaseQuestionGenerator`` -- confirm against the base class.
    """

    # Keywords that mark a query as entity identification in
    # ``generate_questions``.
    _QUESTION_ENTITY_KEYWORDS = (
        "who",
        "what",
        "which",
        "identify",
        "name",
        "character",
        "person",
        "place",
        "organization",
        "company",
        "author",
        "scientist",
        "inventor",
        "city",
        "country",
        "book",
        "movie",
    )

    # ``generate_sub_questions`` has always used this shorter list; it is
    # kept separate so that detection behaviour of each method is unchanged.
    _SUB_QUESTION_ENTITY_KEYWORDS = (
        "who",
        "what",
        "which",
        "identify",
        "name",
        "character",
        "person",
        "place",
        "organization",
        "company",
    )

    @staticmethod
    def _matches_entity_keywords(query: str, keywords: tuple[str, ...]) -> bool:
        """Return True if any keyword occurs as a substring of the lowercased query."""
        lowered = query.lower()
        return any(keyword in lowered for keyword in keywords)

    @staticmethod
    def _extract_response_text(response):
        """Return ``response.content`` when that attribute exists, else ``str(response)``."""
        if hasattr(response, "content"):
            return response.content
        return str(response)

    @staticmethod
    def _parse_prefixed_questions(text: str, limit: int) -> list[str]:
        """Collect the text of lines that start with ``Q:``, keeping at most ``limit``.

        Only the leading ``Q:`` marker is removed, so a ``Q:`` sequence that
        appears inside the question itself (e.g. ``FAQ:``) is preserved.
        Lines that are empty after the marker are skipped.
        """
        questions = []
        for raw_line in text.split("\n"):
            line = raw_line.strip()
            if not line.startswith("Q:"):
                continue
            question = line.removeprefix("Q:").strip()
            if question:
                questions.append(question)
        return questions[:limit]

    @staticmethod
    def _strip_list_marker(line: str) -> str:
        """Remove a leading ``-`` bullet and/or ``N.`` / ``N)`` number from a line.

        Only a marker at the start of the line is removed; periods that occur
        later in the text (``U.S.``, ``Dr.``) are left untouched.
        """
        text = line.lstrip("- ")
        digit_count = 0
        while digit_count < len(text) and text[digit_count].isdigit():
            digit_count += 1
        # Drop the number only when it is followed by a list delimiter, so a
        # question that merely begins with a number is kept intact.
        if digit_count and text[digit_count : digit_count + 1] in (".", ")"):
            text = text[digit_count + 1 :]
        return text.lstrip("- ").strip()

    def generate_questions(
        self,
        current_knowledge: str,
        query: str,
        questions_per_iteration: int = 2,
        questions_by_iteration: dict[int, list[str]] | None = None,
    ) -> list[str]:
        """Generate questions with entity-aware search patterns.

        Args:
            current_knowledge: Knowledge gathered so far; included in the
                prompt only when past questions exist.
            query: The user query to generate search questions for.
            questions_per_iteration: Number of questions requested from the
                model and the maximum number returned.
            questions_by_iteration: Previously asked questions keyed by
                iteration; ``None`` is treated as an empty mapping.

        Returns:
            Up to ``questions_per_iteration`` non-empty questions parsed from
            the ``Q:`` lines of the model reply, or an empty list when the
            query contains none of the entity keywords.
        """
        current_time = datetime.now(UTC).strftime("%Y-%m-%d")
        questions_by_iteration = questions_by_iteration or {}

        logger.info("Generating entity-aware follow-up questions...")

        # Fall back to empty list for non-entity queries
        # (base class method is abstract; subclasses handle their own generation)
        if not self._matches_entity_keywords(
            query, self._QUESTION_ENTITY_KEYWORDS
        ):
            return []

        # Use more direct entity-focused prompt
        if questions_by_iteration:
            prompt = f"""Generate {questions_per_iteration} targeted search queries to identify the specific entity in the query.

Query: {query}
Today: {current_time}
Past questions: {questions_by_iteration!s}
Current knowledge: {current_knowledge}

Create direct search queries that combine the key identifying features to find the specific name/entity.
Focus on:
1. Combining multiple constraints in a single search
2. Using quotation marks for exact phrases
3. Including specific details that narrow down results

Format: One question per line, e.g.
Q: "fictional character" "breaks fourth wall" "TV show" 1960s 1980s
Q: character name ascetics humor television fewer than 50 episodes
"""
        else:
            prompt = f"""Generate {questions_per_iteration} direct search queries to identify the specific entity in: {query}

Today: {current_time}

Create search queries that:
1. Combine multiple identifying features
2. Target the specific entity name/identification
3. Use variations of key terms

Format: One question per line, e.g.
Q: question1
Q: question2
"""

        response = self.model.invoke(prompt)

        # Handle both string responses and responses with .content attribute
        response_text = self._extract_response_text(response)

        questions = self._parse_prefixed_questions(
            response_text, questions_per_iteration
        )

        logger.info(f"Generated {len(questions)} entity-aware questions")

        return questions

    def generate_sub_questions(
        self, query: str, context: str = ""
    ) -> list[str]:
        """Generate sub-questions with entity focus when appropriate.

        Args:
            query: The original question to break down.
            context: Optional extra text inserted into the prompt directly
                below the original question.

        Returns:
            The sub-questions parsed from numbered or ``-`` bulleted lines of
            the model reply. An empty list is returned when the query
            contains none of the entity keywords, or when invoking the model
            or parsing its reply raises (the error is logged).
        """
        # Check if this is an entity identification query
        if not self._matches_entity_keywords(
            query, self._SUB_QUESTION_ENTITY_KEYWORDS
        ):
            return []

        prompt = f"""Break down this entity identification query into targeted sub-questions.

Original Question: {query}
{context}

Generate 2-5 sub-questions that will help identify the specific entity.
Focus on:
1. Combining constraints to narrow down results
2. Finding the actual name/identity
3. Verifying the entity matches all criteria

Format your response as:
1. First sub-question
2. Second sub-question
...

Only provide the numbered sub-questions."""

        try:
            response = self.model.invoke(prompt)
            content = self._extract_response_text(response)

            # Extract numbered questions
            questions = []
            for line in content.strip().split("\n"):
                line = line.strip()
                if line and (line[0].isdigit() or line.startswith("-")):
                    # Remove the number/bullet and clean up
                    question = self._strip_list_marker(line)
                    if question:
                        questions.append(question)

            return questions

        except Exception:
            logger.exception("Error generating sub-questions")
            return []