Coverage for src / local_deep_research / advanced_search_system / questions / decomposition_question.py: 4%
133 statements
« prev ^ index » next coverage.py v7.12.0, created at 2026-01-11 00:51 +0000
« prev ^ index » next coverage.py v7.12.0, created at 2026-01-11 00:51 +0000
1from typing import List
3from langchain_core.language_models import BaseLLM
4from loguru import logger
6from .base_question import BaseQuestionGenerator
class DecompositionQuestionGenerator(BaseQuestionGenerator):
    """Question generator for decomposing complex queries into sub-queries.

    The generator asks the language model to split a research query into
    sub-questions. If nothing usable can be parsed from the answer it retries
    once with a simpler prompt; if that also yields nothing, the model reports
    that no LLM is available, or any exception is raised, it falls back to
    template-based default questions.
    """

    # Question openers removed from the front of a query to isolate its subject.
    _QUESTION_PREFIXES = (
        "what is",
        "what are",
        "how does",
        "how do",
        "how can",
        "why is",
        "why are",
        "when did",
        "where is",
        "which",
        "who is",
        "can",
        "will",
    )

    # Connectives at which a compound question is cut; checked in this
    # priority order, not by position in the text.
    _CONJUNCTIONS = (
        " and ",
        " or ",
        " but ",
        " as ",
        " that ",
        " which ",
        " when ",
        " where ",
        " how ",
    )

    # Leading articles removed from the extracted subject.
    _ARTICLES = ("a ", "an ", "the ")

    # Substrings identifying an "LLM not available" error message that is
    # returned in place of a real answer.
    _LLM_UNAVAILABLE_MARKERS = (
        "No language models are available",
        "Please install Ollama",
    )

    # Limit context length to prevent token limit issues.
    _MAX_CONTEXT_CHARS = 2000

    # Parsed lines must be longer than this to count as a meaningful question.
    _MIN_QUESTION_CHARS = 10

    def __init__(self, model: BaseLLM, max_subqueries: int = 5):
        """
        Initialize the question generator.

        Args:
            model: The language model to use for question generation
            max_subqueries: Maximum number of sub-queries to generate
        """
        super().__init__(model)
        self.max_subqueries = max_subqueries

    def generate_questions(
        self,
        query: str,
        context: str,
        **kwargs,
    ) -> List[str]:
        """
        Generate sub-queries by decomposing the original query.

        Args:
            query: The main research query
            context: Additional context for question generation; only the
                first 2000 characters are sent to the model
            **kwargs: Additional keyword arguments (unused)

        Returns:
            List of generated sub-queries, at most ``max_subqueries`` long.
            Default template questions are returned when the model is
            unavailable, produces nothing parseable, or raises.
        """
        subject = self._extract_subject(query)
        prompt = self._build_decomposition_prompt(query, subject, context)

        logger.info(
            f"Generating sub-questions for query: '{query}', subject: '{subject}'"
        )

        try:
            answer = self._response_text(self.model.invoke(prompt))

            if self._is_llm_unavailable(answer):
                logger.warning(
                    "LLM returned error about language models not being available, using default questions"
                )
                return self._generate_default_questions(query)

            sub_queries = self._parse_sub_queries(answer)

            # If no sub-queries were extracted, try again with a simpler prompt
            if not sub_queries:
                logger.warning(
                    "No sub-queries extracted from first attempt, trying simplified approach"
                )
                simple_prompt = self._build_simple_prompt(query, subject)
                simple_answer = self._response_text(
                    self.model.invoke(simple_prompt)
                )

                if self._is_llm_unavailable(simple_answer):
                    logger.warning(
                        "LLM returned error in simplified prompt, using default questions"
                    )
                    return self._generate_default_questions(query)

                sub_queries = self._parse_sub_queries(simple_answer)

            # If still no sub-queries, create default ones based on the original query
            if not sub_queries:
                logger.warning(
                    "Failed to generate meaningful sub-queries, using default decomposition"
                )
                return self._generate_default_questions(query)

            logger.info(
                f"Generated {len(sub_queries)} sub-questions: {sub_queries}"
            )
            return sub_queries[: self.max_subqueries]  # Limit to max_subqueries

        except Exception as e:
            # Deliberate best-effort boundary: any failure while talking to
            # the model degrades to the template questions instead of raising.
            logger.exception(f"Error generating sub-questions: {e!s}")
            return self._generate_default_questions(query)

    def _extract_subject(self, query: str) -> str:
        """
        Reduce a question-style query to its core subject.

        Queries that do not end with "?" are returned stripped but otherwise
        unchanged. For questions, the trailing "?" is dropped, a leading
        question opener (e.g. "what is") is removed, the text is cut at the
        first listed conjunction that occurs, and leading articles are removed.

        Args:
            query: The main research query

        Returns:
            The extracted subject (may be empty)
        """
        subject = query.strip()
        if not subject.endswith("?"):
            return subject

        # Remove the question mark
        subject = subject[:-1].strip()

        lowered = subject.lower()
        for prefix in self._QUESTION_PREFIXES:
            if not lowered.startswith(prefix):
                continue
            remainder = subject[len(prefix) :]
            # Require a word boundary so "can" does not match "Canada"
            if not remainder or remainder[0].isspace():
                subject = remainder.strip()
                break

        # For compound questions, keep only the part before the conjunction.
        # The position is located in the lowercased text so that " AND "
        # is handled the same way as " and ".
        lowered = subject.lower()
        for conjunction in self._CONJUNCTIONS:
            position = lowered.find(conjunction)
            if position != -1:
                subject = subject[:position].strip()
                logger.info(
                    f"Split compound question at '{conjunction}', extracted: '{subject}'"
                )
                break

        # Clean up the subject if it starts with articles
        for article in self._ARTICLES:
            if subject.lower().startswith(article):
                subject = subject[len(article) :].strip()

        return subject

    def _build_decomposition_prompt(
        self, query: str, subject: str, context: str
    ) -> str:
        """Build the main decomposition prompt, truncating the context."""
        # Tolerate a missing context instead of failing on the slice
        limited_context = (context or "")[: self._MAX_CONTEXT_CHARS]
        return f"""Decompose the main research topic into 3-5 specific sub-queries that can be answered independently.
Focus on breaking down complex concepts and identifying key aspects requiring separate investigation.
Ensure sub-queries are clear, targeted, and help build a comprehensive understanding.

Main Research Topic: {subject}
Original Query: {query}

Context Information:
{limited_context}

Your task is to create 3-5 specific questions that will help thoroughly research this topic.
If the original query is already a question, extract the core subject and formulate questions around that subject.

Return ONLY the sub-queries, one per line, without numbering or bullet points.
Example format:
What is X technology?
How does X compare to Y?
What are the security implications of X?
"""

    def _build_simple_prompt(self, query: str, subject: str) -> str:
        """Build the shorter retry prompt used when the first answer is unusable."""
        return f"""Break down this research topic into 3 simpler sub-questions:

Research Topic: {subject}
Original Query: {query}

Your task is to create 3 specific questions that will help thoroughly research this topic.
If the original query is already a question, use the core subject of that question.

Sub-questions:
1.
2.
3. """

    @staticmethod
    def _response_text(response) -> str:
        """Return the stripped text of a model response (object with ``content`` or plain value)."""
        if hasattr(response, "content"):
            return response.content.strip()
        # Handle string responses
        return str(response).strip()

    def _is_llm_unavailable(self, text: str) -> bool:
        """Return True when *text* contains an "LLM not available" marker."""
        return any(marker in text for marker in self._LLM_UNAVAILABLE_MARKERS)

    @staticmethod
    def _strip_list_marker(line: str) -> str:
        """Return *line* stripped of surrounding whitespace and a leading heading, bullet, or numbering marker."""
        text = line.strip()

        # Markdown heading markers ("#", "##", ...)
        if text.startswith("#"):
            text = text.lstrip("#").lstrip()

        # Bullet markers, alone on the line or followed by text
        if text in ("-", "*", "•"):
            return ""
        if text.startswith(("- ", "* ", "• ")):
            text = text[2:].lstrip()

        # Numbering such as "1." or "12)". The separator must end the line or
        # be followed by whitespace, so "3.5 billion people ..." stays intact.
        digit_count = len(text) - len(text.lstrip("0123456789"))
        if digit_count and text[digit_count : digit_count + 1] in (".", ")"):
            remainder = text[digit_count + 1 :]
            if not remainder or remainder[0].isspace():
                text = remainder.lstrip()

        return text

    def _parse_sub_queries(self, text: str) -> List[str]:
        """
        Extract sub-queries (one per line) from a model answer.

        Bullets and numbering are removed rather than causing the line to be
        dropped, so list-formatted answers are still usable.

        Args:
            text: Raw text returned by the model

        Returns:
            Cleaned lines longer than the minimum question length
        """
        sub_queries = []
        for raw_line in text.splitlines():
            candidate = self._strip_list_marker(raw_line)
            # Skip the list header echoed back from the simplified prompt
            if candidate.startswith("Sub-questions:"):
                continue
            # Ensure it's a meaningful question
            if len(candidate) > self._MIN_QUESTION_CHARS:
                sub_queries.append(candidate)
        return sub_queries

    def _generate_default_questions(self, query: str) -> List[str]:
        """
        Generate default questions for a given query when LLM fails.

        Args:
            query: The main research query

        Returns:
            List of default questions, at most ``max_subqueries`` long
        """
        query = query.strip()
        subject = self._extract_subject(query)
        lowered_subject = subject.lower()

        # For single word or very short subjects, adapt the question format
        is_short_subject = len(subject.split()) <= 2

        logger.info(
            f"Query: '{query}', Identified subject: '{subject}', Short subject: {is_short_subject}"
        )

        # Special case for CSRF - if we've extracted just "csrf" from a longer query
        if lowered_subject in ("csrf", "cross-site request forgery"):
            default_questions = [
                "What is Cross-Site Request Forgery (CSRF)?",
                "How do CSRF attacks work and what are common attack vectors?",
                "What are effective CSRF prevention methods and best practices?",
                "How do CSRF tokens work to prevent attacks?",
                "What are real-world examples of CSRF vulnerabilities and their impact?",
            ]
        elif not subject:
            # Empty query case
            default_questions = [
                "What is the definition of this topic?",
                "What are the key aspects of this topic?",
                "What are practical applications of this concept?",
            ]
        elif any(
            term in lowered_subject
            for term in ("secure", "security", "vulnerability", "attack")
        ):
            # Security-related questions
            default_questions = [
                f"What is {subject} and how does it work?",
                f"What are common {subject} vulnerabilities or attack vectors?",
                f"What are best practices for preventing {subject} issues?",
                f"How can {subject} be detected and mitigated?",
                f"What are real-world examples of {subject} incidents?",
            ]
        elif any(
            term in lowered_subject
            for term in ("programming", "language", "code", "software")
        ):
            # Programming-related questions
            default_questions = [
                f"What is {subject} and how does it work?",
                f"What are the main features and advantages of {subject}?",
                f"What are common use cases and applications for {subject}?",
                f"How does {subject} compare to similar technologies?",
                f"What are best practices when working with {subject}?",
            ]
        elif is_short_subject:
            # For short subjects (1-2 words), use a dedicated format
            default_questions = [
                f"What is {subject}?",
                f"What are the main characteristics of {subject}?",
                f"How is {subject} used in practice?",
                f"What are the advantages and disadvantages of {subject}?",
                f"How has {subject} evolved over time?",
            ]
        else:
            # Generic questions for any topic
            default_questions = [
                f"What is the definition of {subject}?",
                f"What are the key components or features of {subject}?",
                f"What are common applications or use cases for {subject}?",
                f"What are the advantages and limitations of {subject}?",
                f"How does {subject} compare to alternatives?",
            ]

        logger.info(
            f"Using {len(default_questions)} default questions: {default_questions}"
        )
        return default_questions[: self.max_subqueries]