Coverage for src / local_deep_research / advanced_search_system / questions / decomposition_question.py: 4%

133 statements  

« prev     ^ index     » next       coverage.py v7.12.0, created at 2026-01-11 00:51 +0000

import re
from typing import List

from langchain_core.language_models import BaseLLM
from loguru import logger

from .base_question import BaseQuestionGenerator

7 

8 

class DecompositionQuestionGenerator(BaseQuestionGenerator):
    """Question generator for decomposing complex queries into sub-queries."""

    # Question openers removed from the front of a "...?" query so that only
    # the subject remains. Checked in order; the first match wins.
    _QUESTION_PREFIXES = (
        "what is",
        "what are",
        "how does",
        "how do",
        "how can",
        "why is",
        "why are",
        "when did",
        "where is",
        "which",
        "who is",
        "can",
        "will",
    )

    # Separators that usually introduce a secondary clause. Checked in order;
    # the subject is cut at the first occurrence of the first one found.
    _CONJUNCTIONS = (
        " and ",
        " or ",
        " but ",
        " as ",
        " that ",
        " which ",
        " when ",
        " where ",
        " how ",
    )

    # Leading articles dropped from the extracted subject.
    _ARTICLES = ("a ", "an ", "the ")

    # Substrings that identify the "no model configured" placeholder reply.
    _LLM_UNAVAILABLE_MARKERS = (
        "No language models are available",
        "Please install Ollama",
    )

    # Leading list decoration: bullets ("- ", "* ", "• "), numbering
    # ("1. ", "2) ") and markdown heading hashes, possibly combined.
    _LIST_MARKER = re.compile(r"^(?:(?:[-*•]|\d+[.)])\s+|#+\s*)+")

    # A parsed line must be longer than this to count as a real question.
    _MIN_QUESTION_LENGTH = 10

    # Context is truncated to this many characters to limit prompt size.
    _MAX_CONTEXT_CHARS = 2000

    def __init__(self, model: BaseLLM, max_subqueries: int = 5):
        """
        Initialize the question generator.

        Args:
            model: The language model to use for question generation
            max_subqueries: Maximum number of sub-queries to generate
        """
        super().__init__(model)
        self.max_subqueries = max_subqueries

    def generate_questions(
        self,
        query: str,
        context: str,
        **kwargs,
    ) -> List[str]:
        """
        Generate sub-queries by decomposing the original query.

        The model is prompted once with the full context. If nothing usable
        can be parsed from its reply, a shorter prompt is tried. If that also
        yields nothing, if the model reports that no language model is
        available, or if any exception is raised while talking to the model,
        templated default questions are returned instead.

        Args:
            query: The main research query
            context: Additional context for question generation (only the
                first 2000 characters are sent to the model)
            **kwargs: Additional keyword arguments (unused here)

        Returns:
            List of generated sub-queries, at most ``max_subqueries`` long
        """
        subject = self._extract_subject(query)
        logger.info(
            f"Original query: '{query}', Extracted subject: '{subject}'"
        )

        # Truncate outside the f-string: a "#" comment written inside the
        # template would be sent to the model as part of the prompt.
        context_excerpt = (context or "")[: self._MAX_CONTEXT_CHARS]

        # Create a prompt to decompose the query into sub-questions
        prompt = f"""Decompose the main research topic into 3-5 specific sub-queries that can be answered independently.
Focus on breaking down complex concepts and identifying key aspects requiring separate investigation.
Ensure sub-queries are clear, targeted, and help build a comprehensive understanding.

Main Research Topic: {subject}
Original Query: {query}

Context Information:
{context_excerpt}

Your task is to create 3-5 specific questions that will help thoroughly research this topic.
If the original query is already a question, extract the core subject and formulate questions around that subject.

Return ONLY the sub-queries, one per line, without numbering or bullet points.
Example format:
What is X technology?
How does X compare to Y?
What are the security implications of X?
"""

        logger.info(
            f"Generating sub-questions for query: '{query}', subject: '{subject}'"
        )

        try:
            sub_queries_text = self._response_text(self.model.invoke(prompt))

            # Check for the common "No language models available" error
            if self._llm_unavailable(sub_queries_text):
                logger.warning(
                    "LLM returned error about language models not being available, using default questions"
                )
                return self._generate_default_questions(query)

            sub_queries = self._parse_sub_queries(sub_queries_text)

            # If no sub-queries were extracted, try again with a simpler prompt
            if not sub_queries:
                logger.warning(
                    "No sub-queries extracted from first attempt, trying simplified approach"
                )

                simple_prompt = f"""Break down this research topic into 3 simpler sub-questions:

Research Topic: {subject}
Original Query: {query}

Your task is to create 3 specific questions that will help thoroughly research this topic.
If the original query is already a question, use the core subject of that question.

Sub-questions:
1.
2.
3. """

                simple_text = self._response_text(
                    self.model.invoke(simple_prompt)
                )

                # Check again for language model errors
                if self._llm_unavailable(simple_text):
                    logger.warning(
                        "LLM returned error in simplified prompt, using default questions"
                    )
                    return self._generate_default_questions(query)

                sub_queries = self._parse_sub_queries(simple_text)

            # If still no sub-queries, create default ones based on the original query
            if not sub_queries:
                logger.warning(
                    "Failed to generate meaningful sub-queries, using default decomposition"
                )
                return self._generate_default_questions(query)

            logger.info(
                f"Generated {len(sub_queries)} sub-questions: {sub_queries}"
            )
            return sub_queries[: self.max_subqueries]  # Limit to max_subqueries

        except Exception as e:
            # Deliberate catch-all boundary: question generation is
            # best-effort and must not abort the caller.
            logger.exception(f"Error generating sub-questions: {e!s}")
            # Fallback to basic questions in case of error
            return self._generate_default_questions(query)

    def _generate_default_questions(self, query: str) -> List[str]:
        """
        Generate default questions for a given query when LLM fails.

        The question template is chosen from the extracted subject: a
        dedicated CSRF set, a generic set for an empty subject, security- or
        programming-flavoured sets when matching keywords appear, a compact
        set for one- or two-word subjects, and a general set otherwise.

        Args:
            query: The main research query

        Returns:
            List of default questions, at most ``max_subqueries`` long
        """
        query = query.strip()
        subject = self._extract_subject(query)
        lowered_subject = subject.lower()

        # For single word or very short subjects, adapt the question format
        is_short_subject = len(subject.split()) <= 2

        logger.info(
            f"Query: '{query}', Identified subject: '{subject}', Short subject: {is_short_subject}"
        )

        if lowered_subject in ("csrf", "cross-site request forgery"):
            # CSRF-specific questions
            default_questions = [
                "What is Cross-Site Request Forgery (CSRF)?",
                "How do CSRF attacks work and what are common attack vectors?",
                "What are effective CSRF prevention methods and best practices?",
                "How do CSRF tokens work to prevent attacks?",
                "What are real-world examples of CSRF vulnerabilities and their impact?",
            ]
        elif not subject:
            # Empty query case
            default_questions = [
                "What is the definition of this topic?",
                "What are the key aspects of this topic?",
                "What are practical applications of this concept?",
            ]
        elif any(
            term in lowered_subject
            for term in ("secure", "security", "vulnerability", "attack")
        ):
            # Security-related questions
            default_questions = [
                f"What is {subject} and how does it work?",
                f"What are common {subject} vulnerabilities or attack vectors?",
                f"What are best practices for preventing {subject} issues?",
                f"How can {subject} be detected and mitigated?",
                f"What are real-world examples of {subject} incidents?",
            ]
        elif any(
            term in lowered_subject
            for term in ("programming", "language", "code", "software")
        ):
            # Programming-related questions
            default_questions = [
                f"What is {subject} and how does it work?",
                f"What are the main features and advantages of {subject}?",
                f"What are common use cases and applications for {subject}?",
                f"How does {subject} compare to similar technologies?",
                f"What are best practices when working with {subject}?",
            ]
        elif is_short_subject:
            # For short subjects (1-2 words), use a dedicated format
            default_questions = [
                f"What is {subject}?",
                f"What are the main characteristics of {subject}?",
                f"How is {subject} used in practice?",
                f"What are the advantages and disadvantages of {subject}?",
                f"How has {subject} evolved over time?",
            ]
        else:
            # Generic questions for any topic
            default_questions = [
                f"What is the definition of {subject}?",
                f"What are the key components or features of {subject}?",
                f"What are common applications or use cases for {subject}?",
                f"What are the advantages and limitations of {subject}?",
                f"How does {subject} compare to alternatives?",
            ]

        logger.info(
            f"Using {len(default_questions)} default questions: {default_questions}"
        )
        return default_questions[: self.max_subqueries]

    @classmethod
    def _extract_subject(cls, query: str) -> str:
        """
        Reduce a query to its primary subject.

        For queries ending in "?", the question mark and a leading question
        opener (e.g. "what is") are removed. For every query, the text is
        then cut at the first matching conjunction and leading articles are
        dropped.

        Args:
            query: The raw research query

        Returns:
            The extracted subject (may be empty)
        """
        subject = query.strip()

        if subject.endswith("?"):
            subject = subject[:-1].strip()
            for prefix in cls._QUESTION_PREFIXES:
                head = subject[: len(prefix)]
                tail = subject[len(prefix) :]
                # Require a word boundary after the opener so that "can"
                # does not match "Canada" and "which" does not match
                # "whichever".
                if head.lower() == prefix and not tail[:1].isalnum():
                    subject = tail.strip()
                    # Remove trailing ? if present (e.g. "What is X??")
                    if subject.endswith("?"):
                        subject = subject[:-1].strip()
                    break

        # For compound questions, keep only the part before the conjunction.
        # The search is case-insensitive so " AND " is split like " and ".
        for conjunction in cls._CONJUNCTIONS:
            found = re.search(
                re.escape(conjunction), subject, flags=re.IGNORECASE
            )
            if found:
                subject = subject[: found.start()].strip()
                logger.info(
                    f"Split compound question at '{conjunction}', extracted: '{subject}'"
                )
                break

        # Clean up the subject if it starts with articles
        for article in cls._ARTICLES:
            if subject.lower().startswith(article):
                subject = subject[len(article) :].strip()

        return subject

    @staticmethod
    def _response_text(response) -> str:
        """
        Return the stripped text of a model reply.

        Args:
            response: Either an object exposing a ``content`` attribute or
                any value that can be converted with ``str()``

        Returns:
            The reply text with surrounding whitespace removed
        """
        if hasattr(response, "content"):
            # NOTE(review): assumes ``content`` is a string; a non-string
            # value raises here and is handled by the caller's fallback.
            return response.content.strip()
        return str(response).strip()

    @classmethod
    def _llm_unavailable(cls, text: str) -> bool:
        """Return True if the reply is the 'no language model' placeholder."""
        return any(marker in text for marker in cls._LLM_UNAVAILABLE_MARKERS)

    @classmethod
    def _parse_sub_queries(cls, text: str) -> List[str]:
        """
        Extract one question per line from a model reply.

        Leading bullets, numbering and heading hashes are stripped rather
        than causing the line to be discarded, so a reply formatted as a
        numbered or bulleted list is still parsed in full.

        Args:
            text: The raw reply text

        Returns:
            Cleaned question strings, in the order they appeared
        """
        questions = []
        for raw_line in text.split("\n"):
            line = cls._LIST_MARKER.sub("", raw_line.strip()).strip()
            # Skip the echoed list header from the simplified prompt.
            if line.startswith("Sub-questions:"):
                continue
            # Very short leftovers (bare markers, fragments) are not questions.
            if len(line) > cls._MIN_QUESTION_LENGTH:
                questions.append(line)
        return questions