Coverage for src/local_deep_research/advanced_search_system/questions/decomposition_question.py

1from typing import List

3from langchain_core.language_models import BaseLLM

4from loguru import logger

6from .base_question import BaseQuestionGenerator

class DecompositionQuestionGenerator(BaseQuestionGenerator):
    """Question generator for decomposing complex queries into sub-queries.

    The generator asks the language model to split a research query into
    independent sub-questions. If the first answer yields nothing usable it
    retries once with a shorter prompt, and if that also fails (or the model
    raises) it falls back to a fixed set of template questions built around
    the subject extracted from the query.
    """

    # Question openers that are removed to get at the subject of a question.
    # A prefix only matches when it is followed by whitespace or the end of
    # the text, so "can" does not match "Canada".
    _QUESTION_PREFIXES = (
        "what is",
        "what are",
        "how does",
        "how do",
        "how can",
        "why is",
        "why are",
        "when did",
        "where is",
        "which",
        "who is",
        "can",
        "will",
    )

    # Separators after which a compound question is cut off. They are tried
    # in this order and the first one found in the subject wins.
    _CONJUNCTIONS = (
        " and ",
        " or ",
        " but ",
        " as ",
        " that ",
        " which ",
        " when ",
        " where ",
        " how ",
    )

    # Leading articles removed from the extracted subject.
    _ARTICLES = ("a ", "an ", "the ")

    # Fragments of the placeholder answer returned when no LLM is configured.
    _LLM_UNAVAILABLE_MARKERS = (
        "No language models are available",
        "Please install Ollama",
    )

    # Single-character list bullets that may precede a sub-query.
    _BULLET_MARKERS = ("-", "*", "•")

    # Header echoed back by the model from the simplified prompt.
    _SUB_QUESTIONS_HEADER = "Sub-questions:"

    # Maximum number of context characters embedded in the prompt.
    _MAX_CONTEXT_CHARS = 2000

    # A parsed line must be LONGER than this to count as a real question.
    _MIN_SUB_QUERY_CHARS = 10

    def __init__(self, model: BaseLLM, max_subqueries: int = 5):
        """
        Initialize the question generator.

        Args:
            model: The language model to use for question generation
            max_subqueries: Maximum number of sub-queries to generate
        """
        super().__init__(model)
        self.max_subqueries = max_subqueries

    def generate_questions(
        self,
        query: str,
        context: str,
        **kwargs,
    ) -> List[str]:
        """
        Generate sub-queries by decomposing the original query.

        Args:
            query: The main research query
            context: Additional context for question generation; only the
                first ``_MAX_CONTEXT_CHARS`` characters are sent to the model
            **kwargs: Additional keyword arguments (accepted for interface
                compatibility, not used here)

        Returns:
            List of generated sub-queries, at most ``max_subqueries`` long.
            Template questions are returned when the model is unavailable,
            produces nothing usable, or raises.
        """
        subject = self._extract_subject(query)
        logger.info(
            f"Original query: '{query}', Extracted subject: '{subject}'"
        )

        prompt = self._build_prompt(subject, query, context)

        logger.info(
            f"Generating sub-questions for query: '{query}', subject: '{subject}'"
        )

        try:
            response_text = self._response_text(self.model.invoke(prompt))

            if self._llm_unavailable(response_text):
                logger.warning(
                    "LLM returned error about language models not being available, using default questions"
                )
                return self._generate_default_questions(query)

            sub_queries = self._parse_sub_queries(response_text)

            # Second chance: a shorter prompt with an explicit numbered
            # skeleton is easier for weaker models to complete.
            if not sub_queries:
                logger.warning(
                    "No sub-queries extracted from first attempt, trying simplified approach"
                )
                simple_text = self._response_text(
                    self.model.invoke(
                        self._build_simple_prompt(subject, query)
                    )
                )

                if self._llm_unavailable(simple_text):
                    logger.warning(
                        "LLM returned error in simplified prompt, using default questions"
                    )
                    return self._generate_default_questions(query)

                sub_queries = self._parse_sub_queries(simple_text)

            if not sub_queries:
                logger.warning(
                    "Failed to generate meaningful sub-queries, using default decomposition"
                )
                return self._generate_default_questions(query)

            logger.info(
                f"Generated {len(sub_queries)} sub-questions: {sub_queries}"
            )
            return sub_queries[: self.max_subqueries]

        except Exception:
            # Deliberate best-effort boundary: any failure while talking to
            # the model degrades to the template questions instead of
            # aborting the research run.
            logger.exception("Error generating sub-questions")
            return self._generate_default_questions(query)

    def _generate_default_questions(self, query: str) -> List[str]:
        """
        Generate default questions for a given query when LLM fails.

        The question set is picked from the extracted subject: a dedicated
        CSRF set, a placeholder set for an empty subject, security- and
        programming-flavoured sets selected by keyword, a set for short
        (one or two word) subjects, and a generic set otherwise.

        Args:
            query: The main research query

        Returns:
            List of default questions, at most ``max_subqueries`` long
        """
        subject = self._extract_subject(query)
        subject_lower = subject.lower()

        # For single word or very short subjects, adapt the question format
        is_short_subject = len(subject.split()) <= 2

        logger.info(
            f"Query: '{query}', Identified subject: '{subject}', Short subject: {is_short_subject}"
        )

        if subject_lower in ("csrf", "cross-site request forgery"):
            # CSRF-specific questions
            default_questions = [
                "What is Cross-Site Request Forgery (CSRF)?",
                "How do CSRF attacks work and what are common attack vectors?",
                "What are effective CSRF prevention methods and best practices?",
                "How do CSRF tokens work to prevent attacks?",
                "What are real-world examples of CSRF vulnerabilities and their impact?",
            ]
        elif not subject:
            # Empty query case
            default_questions = [
                "What is the definition of this topic?",
                "What are the key aspects of this topic?",
                "What are practical applications of this concept?",
            ]
        elif any(
            term in subject_lower
            for term in ("secure", "security", "vulnerability", "attack")
        ):
            # Security-related questions
            default_questions = [
                f"What is {subject} and how does it work?",
                f"What are common {subject} vulnerabilities or attack vectors?",
                f"What are best practices for preventing {subject} issues?",
                f"How can {subject} be detected and mitigated?",
                f"What are real-world examples of {subject} incidents?",
            ]
        elif any(
            term in subject_lower
            for term in ("programming", "language", "code", "software")
        ):
            # Programming-related questions
            default_questions = [
                f"What is {subject} and how does it work?",
                f"What are the main features and advantages of {subject}?",
                f"What are common use cases and applications for {subject}?",
                f"How does {subject} compare to similar technologies?",
                f"What are best practices when working with {subject}?",
            ]
        elif is_short_subject:
            # For short subjects (1-2 words), use a dedicated format
            default_questions = [
                f"What is {subject}?",
                f"What are the main characteristics of {subject}?",
                f"How is {subject} used in practice?",
                f"What are the advantages and disadvantages of {subject}?",
                f"How has {subject} evolved over time?",
            ]
        else:
            # Generic questions for any topic
            default_questions = [
                f"What is the definition of {subject}?",
                f"What are the key components or features of {subject}?",
                f"What are common applications or use cases for {subject}?",
                f"What are the advantages and limitations of {subject}?",
                f"How does {subject} compare to alternatives?",
            ]

        logger.info(
            f"Using {len(default_questions)} default questions: {default_questions}"
        )
        return default_questions[: self.max_subqueries]

    def _extract_subject(self, query: str) -> str:
        """
        Reduce a query to its core subject.

        Queries that do not end with "?" are returned with surrounding
        whitespace removed. For questions, the trailing "?" is dropped, a
        leading question opener such as "what is" is removed, the text is cut
        at the first conjunction found, and leading articles are stripped.

        Args:
            query: The main research query

        Returns:
            The extracted subject; may be empty (e.g. for "What is?")
        """
        text = (query or "").strip()
        if not text.endswith("?"):
            return text

        subject = text[:-1].strip()

        # Strip a question opener, but only on a word boundary so that "can"
        # does not eat the start of "Canada" or "will" the start of "William".
        lowered = subject.lower()
        for prefix in self._QUESTION_PREFIXES:
            if not lowered.startswith(prefix):
                continue
            remainder = subject[len(prefix) :]
            if not remainder or remainder[0].isspace():
                subject = remainder.strip()
                break

        # For compound questions keep only the part before the conjunction.
        # The position is looked up in the lowercased text so " AND " is cut
        # just like " and ".
        lowered = subject.lower()
        for conjunction in self._CONJUNCTIONS:
            position = lowered.find(conjunction)
            if position != -1:
                subject = subject[:position].strip()
                logger.info(
                    f"Split compound question at '{conjunction}', extracted: '{subject}'"
                )
                break

        # Clean up the subject if it starts with articles
        for article in self._ARTICLES:
            if subject.lower().startswith(article):
                subject = subject[len(article) :].strip()

        return subject

    def _build_prompt(self, subject: str, query: str, context: str) -> str:
        """Build the main decomposition prompt with length-limited context."""
        # Limit context length to prevent token limit issues. This note must
        # stay out of the string itself or it is sent to the model.
        limited_context = (context or "")[: self._MAX_CONTEXT_CHARS]
        return (
            "Decompose the main research topic into 3-5 specific sub-queries that can be answered independently.\n"
            "Focus on breaking down complex concepts and identifying key aspects requiring separate investigation.\n"
            "Ensure sub-queries are clear, targeted, and help build a comprehensive understanding.\n"
            "\n"
            f"Main Research Topic: {subject}\n"
            f"Original Query: {query}\n"
            "\n"
            "Context Information:\n"
            f"{limited_context}\n"
            "\n"
            "Your task is to create 3-5 specific questions that will help thoroughly research this topic.\n"
            "If the original query is already a question, extract the core subject and formulate questions around that subject.\n"
            "\n"
            "Return ONLY the sub-queries, one per line, without numbering or bullet points.\n"
            "Example format:\n"
            "What is X technology?\n"
            "How does X compare to Y?\n"
            "What are the security implications of X?\n"
        )

    def _build_simple_prompt(self, subject: str, query: str) -> str:
        """Build the shorter retry prompt that ends in a numbered skeleton."""
        return (
            "Break down this research topic into 3 simpler sub-questions:\n"
            "\n"
            f"Research Topic: {subject}\n"
            f"Original Query: {query}\n"
            "\n"
            "Your task is to create 3 specific questions that will help thoroughly research this topic.\n"
            "If the original query is already a question, use the core subject of that question.\n"
            "\n"
            "Sub-questions:\n"
            "1.\n"
            "2.\n"
            "3. "
        )

    @staticmethod
    def _response_text(response) -> str:
        """Return the stripped text of a model response.

        Handles both response formats seen here: an object with a
        ``content`` attribute, or a value that is converted with ``str``.
        """
        if hasattr(response, "content"):
            return response.content.strip()
        return str(response).strip()

    def _llm_unavailable(self, text: str) -> bool:
        """Tell whether *text* is the "no language model available" notice."""
        return any(marker in text for marker in self._LLM_UNAVAILABLE_MARKERS)

    def _strip_list_marker(self, line: str) -> str:
        """Remove heading, bullet and numbering markers from the line start.

        Handles "#" headings, the bullets "-", "*" and "•", and numbering
        such as "1." or "2)". A marker is only removed when it is followed
        by whitespace or ends the line, so "3.5 GHz ..." and "**bold**" are
        left untouched.
        """
        text = line.strip().lstrip("#").lstrip()

        if text[:1] in self._BULLET_MARKERS and (
            len(text) == 1 or text[1].isspace()
        ):
            text = text[1:].lstrip()

        digit_count = len(text) - len(text.lstrip("0123456789"))
        if digit_count:
            rest = text[digit_count:]
            if rest[:1] in (".", ")") and (
                len(rest) == 1 or rest[1].isspace()
            ):
                text = rest[1:].lstrip()

        return text

    def _parse_sub_queries(self, text: str) -> List[str]:
        """
        Extract sub-queries from a model answer, one per line.

        List markers are stripped instead of causing the line to be dropped,
        so numbered or bulleted answers are kept. Blank lines, bare markers,
        an echoed "Sub-questions:" header and anything not longer than
        ``_MIN_SUB_QUERY_CHARS`` characters after cleaning are skipped.

        Args:
            text: Raw text returned by the model

        Returns:
            Cleaned sub-queries in their original order
        """
        sub_queries = []
        for raw_line in text.splitlines():
            if raw_line.strip().startswith(self._SUB_QUESTIONS_HEADER):
                continue
            candidate = self._strip_list_marker(raw_line)
            if len(candidate) > self._MIN_SUB_QUERY_CHARS:
                sub_queries.append(candidate)
        return sub_queries

Coverage for src / local_deep_research / advanced_search_system / questions / decomposition_question.py: 90%

133 statements