Coverage for src/local_deep_research/advanced_search_system/questions/browsecomp_question.py: 5%
151 statements
coverage.py v7.12.0, created at 2026-01-11 00:51 +0000
1"""
2BrowseComp-specific question generation that creates progressive, entity-focused searches.
3"""
5import re
6from typing import Dict, List
8from loguru import logger
10from .base_question import BaseQuestionGenerator


class BrowseCompQuestionGenerator(BaseQuestionGenerator):
    """
    Question generator optimized for BrowseComp-style queries.

    Key features:
    1. Extract concrete entities (dates, numbers, names, places)
    2. Generate progressive search combinations
    3. Start broad, then narrow systematically
    4. Focus on verifiable facts
    """

    def __init__(
        self,
        model,
        knowledge_truncate_length: int = 1500,
        previous_searches_limit: int = 10,
    ):
        """Initialize the question generator.

        Args:
            model: The LLM model to use for generation
            knowledge_truncate_length: Max chars for knowledge in prompts (None=unlimited)
            previous_searches_limit: Max previous searches to show (None=unlimited)
        """
        super().__init__(model)
        self.extracted_entities = {}
        self.search_progression = []
        self.knowledge_truncate_length = knowledge_truncate_length
        self.previous_searches_limit = previous_searches_limit

    def generate_questions(
        self,
        current_knowledge: str,
        query: str,
        questions_per_iteration: int = 5,
        questions_by_iteration: dict = None,
        results_by_iteration: dict = None,
        iteration: int = 1,
    ) -> List[str]:
        """Generate progressive search queries for BrowseComp problems."""
        questions_by_iteration = questions_by_iteration or {}

        # First iteration: Extract entities and create initial searches
        if iteration == 1 or not self.extracted_entities:
            self.extracted_entities = self._extract_entities(query)
            return self._generate_initial_searches(
                query, self.extracted_entities, questions_per_iteration
            )

        # Subsequent iterations: Progressive refinement
        return self._generate_progressive_searches(
            query,
            current_knowledge,
            self.extracted_entities,
            questions_by_iteration,
            results_by_iteration or {},
            questions_per_iteration,
            iteration,
        )

    def _extract_entities(self, query: str) -> Dict[str, List[str]]:
        """Extract concrete entities from the query."""
        prompt = f"""Extract ALL concrete, searchable entities from this query:

Query: {query}

Extract:
1. TEMPORAL: All years, dates, time periods (e.g., "2018", "between 1995 and 2006", "2023")
2. NUMERICAL: All numbers, statistics, counts (e.g., "300", "more than 3", "4-3", "84.5%")
3. NAMES: Partial names, name hints, proper nouns (e.g., "Dartmouth", "EMNLP", "Plastic Man")
4. LOCATIONS: Places, institutions, geographic features (e.g., "Pennsylvania", "Grand Canyon")
5. DESCRIPTORS: Key descriptive terms (e.g., "fourth wall", "ascetics", "decider game")

For TEMPORAL entities, if there's a range (e.g., "between 2018-2023"), list EACH individual year.

Format your response as:
TEMPORAL: [entity1], [entity2], ...
NUMERICAL: [entity1], [entity2], ...
NAMES: [entity1], [entity2], ...
LOCATIONS: [entity1], [entity2], ...
DESCRIPTORS: [entity1], [entity2], ...
"""

        response = self.model.invoke(prompt)
        content = (
            response.content if hasattr(response, "content") else str(response)
        )

        entities = {
            "temporal": [],
            "numerical": [],
            "names": [],
            "locations": [],
            "descriptors": [],
        }

        # current_category = None  # Not currently used
        for line in content.strip().split("\n"):
            line = line.strip()
            if ":" in line:
                category, values = line.split(":", 1)
                category = category.strip().lower()
                if category in entities:
                    # Parse comma-separated values
                    values = [v.strip() for v in values.split(",") if v.strip()]
                    entities[category].extend(values)

        # Expand temporal ranges
        entities["temporal"] = self._expand_temporal_ranges(
            entities["temporal"]
        )

        logger.info(f"Extracted entities: {entities}")
        return entities

    def _expand_temporal_ranges(
        self, temporal_entities: List[str]
    ) -> List[str]:
        """Expand year ranges into individual years."""
        expanded = []
        for entity in temporal_entities:
            # Check for range patterns like "2018-2023" or "between 1995 and 2006"
            range_match = re.search(
                r"(\d{4})[-\s]+(?:to|and)?\s*(\d{4})", entity
            )
            if range_match:
                start_year = int(range_match.group(1))
                end_year = int(range_match.group(2))
                for year in range(start_year, end_year + 1):
                    expanded.append(str(year))
            else:
                # Single year or other temporal entity
                year_match = re.search(r"\d{4}", entity)
                if year_match:
                    expanded.append(year_match.group())
                else:
                    expanded.append(entity)

        return list(set(expanded))  # Remove duplicates
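
    # Illustrative example (not in the original source): an input of
    # ["between 1995 and 2006", "2023"] expands to the individual years
    # 1995 through 2006 plus "2023"; the final set() conversion removes
    # duplicates but means the returned order is not guaranteed.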

    def _generate_initial_searches(
        self, query: str, entities: Dict[str, List[str]], num_questions: int
    ) -> List[str]:
        """Generate initial broad searches."""
        searches = []

        # 1. Original query (always include)
        searches.append(query)

        # If only 1 question requested, return just the original query
        if num_questions <= 1:
            return searches[:1]

        # 2. Domain exploration searches (combine key entities)
        if entities["names"] and len(searches) < num_questions:
            for name in entities["names"][:2]:  # Top 2 names
                if len(searches) >= num_questions:
                    break
                searches.append(f"{name}")
                if entities["descriptors"] and len(searches) < num_questions:
                    searches.append(f"{name} {entities['descriptors'][0]}")

        # 3. Temporal searches if years are important
        if (
            entities["temporal"]
            and len(entities["temporal"]) <= 10
            and len(searches) < num_questions
        ):
            # For small year ranges, search each year with a key term
            key_term = (
                entities["names"][0]
                if entities["names"]
                else entities["descriptors"][0]
                if entities["descriptors"]
                else ""
            )
            for year in entities["temporal"][:5]:  # Limit to 5 years initially
                if len(searches) >= num_questions:
                    break
                if key_term:
                    searches.append(f"{key_term} {year}")

        # 4. Location-based searches
        if entities["locations"] and len(searches) < num_questions:
            for location in entities["locations"][:2]:
                if len(searches) >= num_questions:
                    break
                searches.append(f"{location}")
                if entities["descriptors"] and len(searches) < num_questions:
                    searches.append(f"{location} {entities['descriptors'][0]}")

        # Remove duplicates and limit to requested number
        seen = set()
        unique_searches = []
        for s in searches:
            if s.lower() not in seen:
                seen.add(s.lower())
                unique_searches.append(s)
            if len(unique_searches) >= num_questions:
                break

        return unique_searches[:num_questions]
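
    # Hypothetical illustration (not in the original source): with extracted
    # entities names=["Dartmouth"], descriptors=["ascetics"],
    # temporal=["2018", "2019"], no locations, and num_questions=5, the
    # initial searches are the original query plus "Dartmouth",
    # "Dartmouth ascetics", "Dartmouth 2018", and "Dartmouth 2019"
    # (year order follows the set order from _expand_temporal_ranges).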

    def _generate_progressive_searches(
        self,
        query: str,
        current_knowledge: str,
        entities: Dict[str, List[str]],
        questions_by_iteration: dict,
        results_by_iteration: dict,
        num_questions: int,
        iteration: int,
    ) -> List[str]:
        """Generate progressively more specific searches based on findings."""

        # Only add strategy instructions if we have actual result data (adaptive mode)
        strategy_instruction = ""
        if results_by_iteration:
            # Check if recent searches are failing (returning 0 results)
            recent_iterations = [
                i for i in range(max(1, iteration - 5), iteration)
            ]
            zero_count = sum(
                1
                for i in recent_iterations
                if results_by_iteration.get(i, 1) == 0
            )
            searches_failing = zero_count >= 3

            # Adjust strategy based on success/failure
            if searches_failing:
                strategy_instruction = """
IMPORTANT: Your recent searches are returning 0 results - they are TOO NARROW!
- Use FEWER constraints (1-2 terms instead of 4-5)
- Try BROADER, more general searches
- Remove overly specific combinations
- Focus on key concepts, not detailed entity combinations
"""
            else:
                strategy_instruction = """
Focus on finding the specific answer by combining entities systematically.
"""

        # Analyze what we've found so far
        prompt = f"""Based on our search progress, generate targeted follow-up searches.
{strategy_instruction}
Original Query: {query}

Entities Found:
- Names/Terms: {", ".join(entities["names"][:5])}
- Years: {", ".join(entities["temporal"][:5])}
- Locations: {", ".join(entities["locations"][:3])}
- Key Features: {", ".join(entities["descriptors"][:3])}

Current Knowledge Summary:
{current_knowledge[: self.knowledge_truncate_length] if self.knowledge_truncate_length else current_knowledge}

Previous Searches:
{self._format_previous_searches(questions_by_iteration, results_by_iteration)}

Generate {num_questions} NEW search queries that:
1. Combine 2-3 entities we haven't tried together
2. If we found candidate names, search for them with other constraints
3. For year ranges, systematically cover years we haven't searched
4. Use quotes for exact phrases when beneficial

Focus on finding the specific answer, not general information.

Format: One search per line
"""

        response = self.model.invoke(prompt)
        content = (
            response.content if hasattr(response, "content") else str(response)
        )

        # Extract searches from response
        searches = []
        for line in content.strip().split("\n"):
            line = line.strip()
            if line and not line.endswith(":") and len(line) > 5:
                # Clean up common prefixes
                for prefix in ["Q:", "Search:", "-", "*", "•"]:
                    if line.startswith(prefix):
                        line = line[len(prefix) :].strip()
                if line:
                    searches.append(line)

        # Ensure we have enough searches, but respect the limit
        while len(searches) < num_questions:
            # Generate combinations programmatically
            if iteration <= 5 and entities["temporal"]:
                # Continue with year-based searches
                added_any = False
                for year in entities["temporal"]:
                    if not self._was_searched(year, questions_by_iteration):
                        base_term = (
                            entities["names"][0] if entities["names"] else ""
                        )
                        searches.append(f"{base_term} {year}".strip())
                        added_any = True
                        if len(searches) >= num_questions:
                            break
                if not added_any:
                    break  # No more year searches to add
            else:
                # Combine multiple constraints
                added_any = False
                if entities["names"] and entities["descriptors"]:
                    for name in entities["names"]:
                        for desc in entities["descriptors"]:
                            combo = f"{name} {desc}"
                            if not self._was_searched(
                                combo, questions_by_iteration
                            ):
                                searches.append(combo)
                                added_any = True
                                if len(searches) >= num_questions:
                                    break
                        if len(searches) >= num_questions:
                            break
                if not added_any:
                    break  # No more combinations to add

        return searches[:num_questions]

    def _format_previous_searches(
        self, questions_by_iteration: dict, results_by_iteration: dict
    ) -> str:
        """Format previous searches for context with result counts."""
        formatted = []
        for iteration, questions in questions_by_iteration.items():
            if isinstance(questions, list):
                result_count = results_by_iteration.get(iteration, "?")
                # Limit questions per iteration (main uses 3)
                questions_to_show = (
                    questions[:3] if self.previous_searches_limit else questions
                )
                for q in questions_to_show:
                    # Only show result counts if we have actual data (not "?")
                    if result_count != "?":
                        formatted.append(
                            f"Iteration {iteration}: {q} ({result_count} results)"
                        )
                    else:
                        formatted.append(f"Iteration {iteration}: {q}")
        # Apply limit if configured (main uses last 10)
        if self.previous_searches_limit:
            return "\n".join(formatted[-self.previous_searches_limit :])
        return "\n".join(formatted)

    def _was_searched(self, term: str, questions_by_iteration: dict) -> bool:
        """Check if a term was already searched."""
        term_lower = term.lower()
        for questions in questions_by_iteration.values():
            if isinstance(questions, list):
                for q in questions:
                    if term_lower in q.lower():
                        return True
        return False
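

# Usage sketch (illustrative addition, not part of the original module).
# Assumptions: the model passed in only needs an ``invoke`` method returning a
# plain string or an object with a ``.content`` attribute (mirroring the
# hasattr checks above), and ``BaseQuestionGenerator.__init__`` simply stores
# the model. If executed, run via ``python -m`` so the relative import at the
# top of this file resolves.
if __name__ == "__main__":

    class _StubModel:
        """Hypothetical stand-in for an LLM; returns canned entity output."""

        def invoke(self, prompt: str) -> str:
            return (
                "TEMPORAL: 2018, 2019\n"
                "NUMERICAL:\n"
                "NAMES: Dartmouth\n"
                "LOCATIONS:\n"
                "DESCRIPTORS: ascetics"
            )

    generator = BrowseCompQuestionGenerator(model=_StubModel())
    initial_searches = generator.generate_questions(
        current_knowledge="",
        query="Which Dartmouth ascetic was active between 2018 and 2019?",
        questions_per_iteration=5,
        iteration=1,
    )
    print(initial_searches)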