Coverage for src/local_deep_research/web_search_engines/engines/search_engine_github.py: 99%

330 statements  

« prev     ^ index     » next       coverage.py v7.14.1, created at 2026-06-03 23:15 +0000

1import base64 

2import json 

3import time 

4from typing import Any, Dict, List, Optional 

5 

6from langchain_core.language_models import BaseLLM 

7from loguru import logger 

8 

9from ...config import llm_config, search_config 

10from ...constants import USER_AGENT 

11from ...security.safe_requests import safe_get 

12from ...utilities.json_utils import extract_json, get_llm_response_text 

13from ..search_engine_base import BaseSearchEngine 

14 

15 

# Search types accepted by GitHubSearchEngine. Each value is also used verbatim
# as the last path segment of the search endpoint URL (".../search/<type>").
_VALID_SEARCH_TYPES = frozenset({"repositories", "code", "issues", "users"})

17 

18 

19class GitHubSearchEngine(BaseSearchEngine): 

20 """ 

21 GitHub search engine implementation. 

22 Provides search across GitHub repositories, code, issues, and users. 

23 """ 

24 

25 is_lexical = True 

26 needs_llm_relevance_filter = True 

27 

28 def __init__( 

29 self, 

30 max_results: int = 15, 

31 api_key: Optional[str] = None, 

32 search_type: str = "repositories", 

33 include_readme: bool = True, 

34 include_issues: bool = False, 

35 llm: Optional[BaseLLM] = None, 

36 max_filtered_results: Optional[int] = None, 

37 settings_snapshot: Optional[Dict[str, Any]] = None, 

38 ): 

39 """ 

40 Initialize the GitHub search engine. 

41 

42 Args: 

43 max_results: Maximum number of search results 

44 api_key: GitHub API token (can also be set via LDR_SEARCH_ENGINE_WEB_GITHUB_API_KEY env var or in UI settings) 

45 search_type: Type of GitHub search ("repositories", "code", "issues", "users") 

46 include_readme: Whether to include README content for repositories 

47 include_issues: Whether to include recent issues for repositories 

48 llm: Language model for relevance filtering 

49 max_filtered_results: Maximum number of results to keep after filtering 

50 """ 

51 # Initialize the BaseSearchEngine with LLM, max_filtered_results, and max_results 

52 super().__init__( 

53 llm=llm, 

54 max_filtered_results=max_filtered_results, 

55 max_results=max_results, 

56 settings_snapshot=settings_snapshot, 

57 ) 

58 self.api_key = api_key 

59 if search_type not in _VALID_SEARCH_TYPES: 

60 raise ValueError( 

61 f"Invalid GitHub search_type: {search_type!r}. " 

62 f"Must be one of {_VALID_SEARCH_TYPES}" 

63 ) 

64 self.search_type = search_type 

65 self.include_readme = include_readme 

66 self.include_issues = include_issues 

67 

68 self._owns_llm = False 

69 

70 # API endpoints 

71 self.api_base = "https://api.github.com" 

72 self.search_endpoint = f"{self.api_base}/search/{search_type}" 

73 

74 # Set up API headers 

75 self.headers = { 

76 "Accept": "application/vnd.github.v3+json", 

77 "User-Agent": USER_AGENT, 

78 } 

79 

80 # Add authentication if API key provided 

81 if self.api_key: 

82 self.headers["Authorization"] = f"token {self.api_key}" 

83 logger.info("Using authenticated GitHub API requests") 

84 else: 

85 logger.warning( 

86 "No GitHub API key provided. Rate limits will be restricted." 

87 ) 

88 

89 def close(self) -> None: 

90 """Close the lazily-loaded LLM client if this engine created it.""" 

91 from ...utilities.resource_utils import safe_close 

92 

93 if self._owns_llm: 

94 safe_close(self.llm, "GitHub LLM") 

95 super().close() 

96 

97 def _handle_rate_limits(self, response): 

98 """Handle GitHub API rate limits by logging warnings and sleeping if necessary""" 

99 remaining = int(response.headers.get("X-RateLimit-Remaining", 60)) 

100 reset_time = int(response.headers.get("X-RateLimit-Reset", 0)) 

101 

102 if remaining < 5: 

103 current_time = time.time() 

104 wait_time = max(reset_time - current_time, 0) 

105 logger.warning( 

106 f"GitHub API rate limit almost reached. {remaining} requests remaining." 

107 ) 

108 

109 if wait_time > 0 and remaining == 0: 

110 logger.warning( 

111 f"GitHub API rate limit exceeded. Waiting {wait_time:.0f} seconds." 

112 ) 

113 time.sleep(min(wait_time, 60)) # Wait at most 60 seconds 

114 

115 def _optimize_github_query(self, query: str) -> str: 

116 """ 

117 Optimize the GitHub search query using LLM to improve search results. 

118 

119 Args: 

120 query: Original search query 

121 

122 Returns: 

123 Optimized GitHub search query 

124 """ 

125 # Get LLM from config if not already set 

126 if not self.llm: 

127 try: 

128 self.llm = llm_config.get_llm() 

129 self._owns_llm = True 

130 if not self.llm: 

131 logger.warning("No LLM available for query optimization") 

132 return query 

133 except Exception: 

134 logger.exception("Error getting LLM from config") 

135 return query 

136 

137 prompt = f"""Transform this GitHub search query into an optimized version for the GitHub search API. Follow these steps: 

138 1. Strip question words (e.g., 'what', 'are', 'is'), stop words (e.g., 'and', 'as', 'of', 'on'), and redundant terms (e.g., 'repositories', 'repos', 'github') since they're implied by the search context. 

139 2. Keep only domain-specific keywords and avoid using "-related" terms. 

140 3. Add GitHub-specific filters with dynamic thresholds based on query context: 

141 - For stars: Use higher threshold (e.g., 'stars:>1000') for mainstream topics, lower (e.g., 'stars:>50') for specialized topics 

142 - For language: Detect programming language from query or omit if unclear 

143 - For search scope: Use 'in:name,description,readme' for general queries, 'in:file' for code-specific queries 

144 4. For date ranges, adapt based on query context: 

145 - For emerging: Use 'created:>2024-01-01' 

146 - For mature: Use 'pushed:>2023-01-01' 

147 - For historical research: Use 'created:2020-01-01..2024-01-01' 

148 5. For excluding results, adapt based on query: 

149 - Exclude irrelevant languages based on context 

150 - Use 'NOT' to exclude competing terms 

151 6. Ensure the output is a concise, space-separated string with no punctuation or extra text beyond keywords and filters. 

152 

153 

154 Original query: "{query}" 

155 

156 Return ONLY the optimized query, ready for GitHub's search API. Do not include explanations or additional text.""" 

157 

158 try: 

159 response = self.llm.invoke(prompt) 

160 

161 # Handle different response formats (string or object with content attribute) 

162 if hasattr(response, "content"): 

163 optimized_query = str(response.content).strip() 

164 else: 

165 # Handle string responses 

166 optimized_query = str(response).strip() 

167 

168 # Validate the optimized query 

169 if optimized_query and len(optimized_query) > 0: 

170 logger.info( 

171 f"LLM optimized query from '{query}' to '{optimized_query}'" 

172 ) 

173 return optimized_query 

174 logger.warning("LLM returned empty query, using original") 

175 return query 

176 

177 except Exception: 

178 logger.exception("Error optimizing query with LLM") 

179 return query 

180 

181 def _search_github(self, query: str) -> List[Dict[str, Any]]: 

182 """ 

183 Perform a GitHub search based on the configured search type. 

184 

185 Args: 

186 query: The search query 

187 

188 Returns: 

189 List of GitHub search result items 

190 """ 

191 results = [] 

192 

193 try: 

194 # Optimize GitHub query using LLM 

195 github_query = self._optimize_github_query(query) 

196 

197 logger.info(f"Final GitHub query: {github_query}") 

198 

199 # Construct search parameters 

200 params = { 

201 "q": github_query, 

202 "per_page": min( 

203 self.max_results, 100 

204 ), # GitHub API max is 100 per page 

205 "page": 1, 

206 } 

207 

208 # Add sort parameters based on search type 

209 if self.search_type == "repositories": 

210 params["sort"] = "stars" 

211 params["order"] = "desc" 

212 elif self.search_type == "code": 

213 params["sort"] = "indexed" 

214 params["order"] = "desc" 

215 elif self.search_type == "issues": 

216 params["sort"] = "updated" 

217 params["order"] = "desc" 

218 elif self.search_type == "users": 218 ↛ 223line 218 didn't jump to line 223 because the condition on line 218 was always true

219 params["sort"] = "followers" 

220 params["order"] = "desc" 

221 

222 # Apply rate limiting before request 

223 self._last_wait_time = self.rate_tracker.apply_rate_limit( 

224 self.engine_type 

225 ) 

226 

227 # Execute the API request 

228 response = safe_get( 

229 self.search_endpoint, headers=self.headers, params=params 

230 ) 

231 

232 # Check for rate limiting 

233 self._handle_rate_limits(response) 

234 

235 # Handle response with detailed logging 

236 if response.status_code == 200: 

237 data = response.json() 

238 total_count = data.get("total_count", 0) 

239 results = data.get("items", []) 

240 logger.info( 

241 f"GitHub search returned {len(results)} results (total available: {total_count})" 

242 ) 

243 

244 # Log the rate limit information 

245 rate_limit_remaining = response.headers.get( 

246 "X-RateLimit-Remaining", "unknown" 

247 ) 

248 logger.info( 

249 f"GitHub API rate limit: {rate_limit_remaining} requests remaining" 

250 ) 

251 

252 # If no results, try to provide more guidance 

253 if not results: 

254 logger.warning( 

255 "No results found. Consider these search tips:" 

256 ) 

257 logger.warning("1. Use shorter, more specific queries") 

258 logger.warning( 

259 "2. For repositories, try adding 'stars:>100' or 'language:python'" 

260 ) 

261 logger.warning( 

262 "3. For contribution opportunities, search for 'good-first-issue' or 'help-wanted'" 

263 ) 

264 else: 

265 logger.error( 

266 f"GitHub API error: {response.status_code} - {response.text}" 

267 ) 

268 

269 except Exception: 

270 logger.exception("Error searching GitHub") 

271 

272 return results 

273 

274 def _get_readme_content(self, repo_full_name: str) -> str: 

275 """ 

276 Get README content for a repository. 

277 

278 Args: 

279 repo_full_name: Full name of the repository (owner/repo) 

280 

281 Returns: 

282 Decoded README content or empty string if not found 

283 """ 

284 try: 

285 # Get README 

286 # Apply rate limiting before request 

287 self._last_wait_time = self.rate_tracker.apply_rate_limit( 

288 self.engine_type 

289 ) 

290 

291 response = safe_get( 

292 f"{self.api_base}/repos/{repo_full_name}/readme", 

293 headers=self.headers, 

294 ) 

295 

296 # Check for rate limiting 

297 self._handle_rate_limits(response) 

298 

299 if response.status_code == 200: 

300 data = response.json() 

301 content: str = data.get("content", "") 

302 encoding = data.get("encoding", "") 

303 

304 if encoding == "base64" and content: 

305 return base64.b64decode(content).decode( 

306 "utf-8", errors="replace" 

307 ) 

308 return content 

309 logger.warning( 

310 f"Could not get README for {repo_full_name}: {response.status_code}" 

311 ) 

312 return "" 

313 

314 except Exception: 

315 logger.exception(f"Error getting README for {repo_full_name}") 

316 return "" 

317 

318 def _get_recent_issues( 

319 self, repo_full_name: str, limit: int = 5 

320 ) -> List[Dict[str, Any]]: 

321 """ 

322 Get recent issues for a repository. 

323 

324 Args: 

325 repo_full_name: Full name of the repository (owner/repo) 

326 limit: Maximum number of issues to return 

327 

328 Returns: 

329 List of recent issues 

330 """ 

331 issues = [] 

332 

333 try: 

334 # Get recent issues 

335 # Apply rate limiting before request 

336 self._last_wait_time = self.rate_tracker.apply_rate_limit( 

337 self.engine_type 

338 ) 

339 

340 response = safe_get( 

341 f"{self.api_base}/repos/{repo_full_name}/issues", 

342 headers=self.headers, 

343 params={ 

344 "state": "all", 

345 "per_page": limit, 

346 "sort": "updated", 

347 "direction": "desc", 

348 }, 

349 ) 

350 

351 # Check for rate limiting 

352 self._handle_rate_limits(response) 

353 

354 if response.status_code == 200: 

355 issues = response.json() 

356 logger.info( 

357 f"Got {len(issues)} recent issues for {repo_full_name}" 

358 ) 

359 else: 

360 logger.warning( 

361 f"Could not get issues for {repo_full_name}: {response.status_code}" 

362 ) 

363 

364 except Exception: 

365 logger.exception(f"Error getting issues for {repo_full_name}") 

366 

367 return issues 

368 

369 def _get_file_content(self, file_url: str) -> str: 

370 """ 

371 Get content of a file from GitHub. 

372 

373 Args: 

374 file_url: API URL for the file 

375 

376 Returns: 

377 Decoded file content or empty string if not found 

378 """ 

379 try: 

380 # Apply rate limiting before request 

381 self._last_wait_time = self.rate_tracker.apply_rate_limit( 

382 self.engine_type 

383 ) 

384 

385 # Get file content 

386 response = safe_get(file_url, headers=self.headers) 

387 

388 # Check for rate limiting 

389 self._handle_rate_limits(response) 

390 

391 if response.status_code == 200: 

392 data = response.json() 

393 content2: str = data.get("content", "") 

394 encoding = data.get("encoding", "") 

395 

396 if encoding == "base64" and content2: 

397 return base64.b64decode(content2).decode( 

398 "utf-8", errors="replace" 

399 ) 

400 return content2 

401 logger.warning( 

402 f"Could not get file content: {response.status_code}" 

403 ) 

404 return "" 

405 

406 except Exception: 

407 logger.exception("Error getting file content") 

408 return "" 

409 

410 def _format_repository_preview( 

411 self, repo: Dict[str, Any] 

412 ) -> Dict[str, Any]: 

413 """Format repository search result as preview""" 

414 return { 

415 "id": str(repo.get("id", "")), 

416 "title": repo.get("full_name", ""), 

417 "link": repo.get("html_url", ""), 

418 "snippet": repo.get("description", "No description provided"), 

419 "stars": repo.get("stargazers_count", 0), 

420 "forks": repo.get("forks_count", 0), 

421 "language": repo.get("language", ""), 

422 "updated_at": repo.get("updated_at", ""), 

423 "created_at": repo.get("created_at", ""), 

424 "topics": repo.get("topics", []), 

425 "owner": repo.get("owner", {}).get("login", ""), 

426 "is_fork": repo.get("fork", False), 

427 "search_type": "repository", 

428 "repo_full_name": repo.get("full_name", ""), 

429 } 

430 

431 def _format_code_preview(self, code: Dict[str, Any]) -> Dict[str, Any]: 

432 """Format code search result as preview""" 

433 repo = code.get("repository", {}) 

434 return { 

435 "id": f"code_{code.get('sha', '')}", 

436 "title": f"{code.get('name', '')} in {repo.get('full_name', '')}", 

437 "link": code.get("html_url", ""), 

438 "snippet": f"Match in {code.get('path', '')}", 

439 "path": code.get("path", ""), 

440 "repo_name": repo.get("full_name", ""), 

441 "repo_url": repo.get("html_url", ""), 

442 "search_type": "code", 

443 "file_url": code.get("url", ""), 

444 } 

445 

446 def _format_issue_preview(self, issue: Dict[str, Any]) -> Dict[str, Any]: 

447 """Format issue search result as preview""" 

448 repo = ( 

449 issue.get("repository", {}) 

450 if "repository" in issue 

451 else {"full_name": ""} 

452 ) 

453 return { 

454 "id": f"issue_{issue.get('number', '')}", 

455 "title": issue.get("title", ""), 

456 "link": issue.get("html_url", ""), 

457 "snippet": ( 

458 issue.get("body", "")[:200] + "..." 

459 if len(issue.get("body", "")) > 200 

460 else issue.get("body", "") 

461 ), 

462 "state": issue.get("state", ""), 

463 "created_at": issue.get("created_at", ""), 

464 "updated_at": issue.get("updated_at", ""), 

465 "user": issue.get("user", {}).get("login", ""), 

466 "comments": issue.get("comments", 0), 

467 "search_type": "issue", 

468 "repo_name": repo.get("full_name", ""), 

469 } 

470 

471 def _format_user_preview(self, user: Dict[str, Any]) -> Dict[str, Any]: 

472 """Format user search result as preview""" 

473 return { 

474 "id": f"user_{user.get('id', '')}", 

475 "title": user.get("login", ""), 

476 "link": user.get("html_url", ""), 

477 "snippet": user.get("bio", "No bio provided"), 

478 "name": user.get("name", ""), 

479 "followers": user.get("followers", 0), 

480 "public_repos": user.get("public_repos", 0), 

481 "location": user.get("location", ""), 

482 "search_type": "user", 

483 "user_login": user.get("login", ""), 

484 } 

485 

486 def _get_previews(self, query: str) -> List[Dict[str, Any]]: 

487 """ 

488 Get preview information for GitHub search results. 

489 

490 Args: 

491 query: The search query 

492 

493 Returns: 

494 List of preview dictionaries 

495 """ 

496 logger.info(f"Getting GitHub previews for query: {query}") 

497 

498 # For contribution-focused queries, automatically adjust search type and add filters 

499 if any( 

500 term in query.lower() 

501 for term in [ 

502 "contribute", 

503 "contributing", 

504 "contribution", 

505 "beginner", 

506 "newcomer", 

507 ] 

508 ): 

509 # Use repositories search with help-wanted or good-first-issue labels 

510 original_search_type = self.search_type 

511 self.search_type = "repositories" 

512 self.search_endpoint = f"{self.api_base}/search/repositories" 

513 

514 # Create a specialized query for finding beginner-friendly projects 

515 specialized_query = "good-first-issues:>5 is:public archived:false" 

516 

517 # Extract language preferences if present 

518 languages = [] 

519 for lang in [ 

520 "python", 

521 "javascript", 

522 "java", 

523 "rust", 

524 "go", 

525 "typescript", 

526 "c#", 

527 "c++", 

528 "ruby", 

529 ]: 

530 if lang in query.lower(): 

531 languages.append(lang) 

532 

533 if languages: 

534 specialized_query += f" language:{' language:'.join(languages)}" 

535 

536 # Extract keywords 

537 keywords = [ 

538 word 

539 for word in query.split() 

540 if len(word) > 3 

541 and word.lower() 

542 not in [ 

543 "recommend", 

544 "recommended", 

545 "github", 

546 "repositories", 

547 "looking", 

548 "developers", 

549 "contribute", 

550 "contributing", 

551 "beginner", 

552 "newcomer", 

553 ] 

554 ] 

555 

556 if keywords: 556 ↛ 561line 556 didn't jump to line 561 because the condition on line 556 was always true

557 specialized_query += " " + " ".join( 

558 keywords[:5] 

559 ) # Add up to 5 keywords 

560 

561 logger.info( 

562 f"Using specialized contribution query: {specialized_query}" 

563 ) 

564 

565 # Perform GitHub search with specialized query 

566 results = self._search_github(specialized_query) 

567 

568 # Restore original search type 

569 self.search_type = original_search_type 

570 self.search_endpoint = f"{self.api_base}/search/{self.search_type}" 

571 else: 

572 # Perform standard GitHub search 

573 results = self._search_github(query) 

574 

575 if not results: 

576 logger.warning(f"No GitHub results found for query: {query}") 

577 return [] 

578 

579 # Format results as previews 

580 previews = [] 

581 for result in results: 

582 # Format based on search type 

583 if self.search_type == "repositories": 

584 preview = self._format_repository_preview(result) 

585 elif self.search_type == "code": 

586 preview = self._format_code_preview(result) 

587 elif self.search_type == "issues": 

588 preview = self._format_issue_preview(result) 

589 elif self.search_type == "users": 589 ↛ 592line 589 didn't jump to line 592 because the condition on line 589 was always true

590 preview = self._format_user_preview(result) 

591 else: 

592 logger.warning(f"Unknown search type: {self.search_type}") 

593 continue 

594 

595 previews.append(preview) 

596 

597 logger.info(f"Formatted {len(previews)} GitHub preview results") 

598 return previews 

599 

600 def _get_full_content( 

601 self, relevant_items: List[Dict[str, Any]] 

602 ) -> List[Dict[str, Any]]: 

603 """ 

604 Get full content for the relevant GitHub search results. 

605 

606 Args: 

607 relevant_items: List of relevant preview dictionaries 

608 

609 Returns: 

610 List of result dictionaries with full content 

611 """ 

612 # Check if we should add full content 

613 if ( 

614 hasattr(search_config, "SEARCH_SNIPPETS_ONLY") 

615 and search_config.SEARCH_SNIPPETS_ONLY 

616 ): 

617 logger.info("Snippet-only mode, skipping full content retrieval") 

618 return relevant_items 

619 

620 logger.info( 

621 f"Getting full content for {len(relevant_items)} GitHub results" 

622 ) 

623 

624 results = [] 

625 for item in relevant_items: 

626 result = item.copy() 

627 search_type = item.get("search_type", "") 

628 

629 # Add content based on search type 

630 if search_type == "repository" and self.include_readme: 

631 repo_full_name = item.get("repo_full_name", "") 

632 if repo_full_name: 

633 # Get README content 

634 readme_content = self._get_readme_content(repo_full_name) 

635 result["full_content"] = readme_content 

636 result["content_type"] = "readme" 

637 

638 # Get recent issues if requested 

639 if self.include_issues: 

640 issues = self._get_recent_issues(repo_full_name) 

641 result["recent_issues"] = issues 

642 

643 elif search_type == "code": 

644 file_url = item.get("file_url", "") 

645 if file_url: 

646 # Get file content 

647 file_content = self._get_file_content(file_url) 

648 result["full_content"] = file_content 

649 result["content_type"] = "file" 

650 

651 elif search_type == "issue": 

652 # For issues, the snippet usually contains a summary already 

653 # We'll just keep it as is 

654 result["full_content"] = item.get("snippet", "") 

655 result["content_type"] = "issue" 

656 

657 elif search_type == "user": 

658 # For users, construct a profile summary 

659 profile_summary = f"GitHub user: {item.get('title', '')}\n" 

660 

661 if item.get("name"): 

662 profile_summary += f"Name: {item.get('name')}\n" 

663 

664 if item.get("location"): 

665 profile_summary += f"Location: {item.get('location')}\n" 

666 

667 profile_summary += f"Followers: {item.get('followers', 0)}\n" 

668 profile_summary += ( 

669 f"Public repositories: {item.get('public_repos', 0)}\n" 

670 ) 

671 

672 if ( 

673 item.get("snippet") 

674 and item.get("snippet") != "No bio provided" 

675 ): 

676 profile_summary += f"\nBio: {item.get('snippet')}\n" 

677 

678 result["full_content"] = profile_summary 

679 result["content_type"] = "user_profile" 

680 

681 results.append(result) 

682 

683 return results 

684 

685 def search_repository( 

686 self, repo_owner: str, repo_name: str 

687 ) -> Dict[str, Any]: 

688 """ 

689 Get detailed information about a specific repository. 

690 

691 Args: 

692 repo_owner: Owner of the repository 

693 repo_name: Name of the repository 

694 

695 Returns: 

696 Dictionary with repository information 

697 """ 

698 repo_full_name = f"{repo_owner}/{repo_name}" 

699 logger.info(f"Getting details for repository: {repo_full_name}") 

700 

701 try: 

702 # Get repository details 

703 # Apply rate limiting before request 

704 self._last_wait_time = self.rate_tracker.apply_rate_limit( 

705 self.engine_type 

706 ) 

707 

708 response = safe_get( 

709 f"{self.api_base}/repos/{repo_full_name}", headers=self.headers 

710 ) 

711 

712 # Check for rate limiting 

713 self._handle_rate_limits(response) 

714 

715 if response.status_code == 200: 

716 repo = response.json() 

717 

718 # Format as repository preview 

719 result = self._format_repository_preview(repo) 

720 

721 # Add README content if requested 

722 if self.include_readme: 

723 readme_content = self._get_readme_content(repo_full_name) 

724 result["full_content"] = readme_content 

725 result["content_type"] = "readme" 

726 

727 # Add recent issues if requested 

728 if self.include_issues: 

729 issues = self._get_recent_issues(repo_full_name) 

730 result["recent_issues"] = issues 

731 

732 return result 

733 logger.error( 

734 f"Error getting repository details: {response.status_code} - {response.text}" 

735 ) 

736 return {} 

737 

738 except Exception: 

739 logger.exception("Error getting repository details") 

740 return {} 

741 

742 def search_code( 

743 self, 

744 query: str, 

745 language: Optional[str] = None, 

746 user: Optional[str] = None, 

747 ) -> List[Dict[str, Any]]: 

748 """ 

749 Search for code with more specific parameters. 

750 

751 Args: 

752 query: Code search query 

753 language: Filter by programming language 

754 user: Filter by GitHub username/organization 

755 

756 Returns: 

757 List of code search results 

758 """ 

759 # Build advanced query 

760 advanced_query = query 

761 

762 if language: 

763 advanced_query += f" language:{language}" 

764 

765 if user: 

766 advanced_query += f" user:{user}" 

767 

768 # Save current search type 

769 original_search_type = self.search_type 

770 

771 try: 

772 # Set search type to code 

773 self.search_type = "code" 

774 self.search_endpoint = f"{self.api_base}/search/code" 

775 

776 # Perform search 

777 results = self._search_github(advanced_query) 

778 

779 # Format results 

780 previews = [self._format_code_preview(result) for result in results] 

781 

782 # Get full content if requested 

783 if ( 

784 hasattr(search_config, "SEARCH_SNIPPETS_ONLY") 

785 and not search_config.SEARCH_SNIPPETS_ONLY 

786 ): 

787 return self._get_full_content(previews) 

788 

789 return previews 

790 

791 finally: 

792 # Restore original search type 

793 self.search_type = original_search_type 

794 self.search_endpoint = f"{self.api_base}/search/{self.search_type}" 

795 

796 def search_issues( 

797 self, query: str, state: str = "open", sort: str = "updated" 

798 ) -> List[Dict[str, Any]]: 

799 """ 

800 Search for issues with more specific parameters. 

801 

802 Args: 

803 query: Issue search query 

804 state: Filter by issue state ("open", "closed", "all") 

805 sort: Sort order ("updated", "created", "comments") 

806 

807 Returns: 

808 List of issue search results 

809 """ 

810 # Build advanced query 

811 advanced_query = query + f" state:{state}" 

812 

813 # Save current search type 

814 original_search_type = self.search_type 

815 

816 try: 

817 # Set search type to issues 

818 self.search_type = "issues" 

819 self.search_endpoint = f"{self.api_base}/search/issues" 

820 

821 # Set sort parameter 

822 params = { 

823 "q": advanced_query, 

824 "per_page": min(self.max_results, 100), 

825 "page": 1, 

826 "sort": sort, 

827 "order": "desc", 

828 } 

829 

830 # Perform search 

831 response = safe_get( 

832 self.search_endpoint, headers=self.headers, params=params 

833 ) 

834 

835 # Check for rate limiting 

836 self._handle_rate_limits(response) 

837 

838 if response.status_code == 200: 

839 data = response.json() 

840 results = data.get("items", []) 

841 

842 # Format results 

843 return [ 

844 self._format_issue_preview(result) for result in results 

845 ] 

846 

847 # For issues, we don't need to get full content 

848 logger.error( 

849 f"GitHub API error: {response.status_code} - {response.text}" 

850 ) 

851 return [] 

852 

853 finally: 

854 # Restore original search type 

855 self.search_type = original_search_type 

856 self.search_endpoint = f"{self.api_base}/search/{self.search_type}" 

857 

858 def set_search_type(self, search_type: str): 

859 """ 

860 Set the search type for subsequent searches. 

861 

862 Args: 

863 search_type: Type of GitHub search ("repositories", "code", "issues", "users") 

864 """ 

865 if search_type not in _VALID_SEARCH_TYPES: 

866 raise ValueError( 

867 f"Invalid GitHub search_type: {search_type!r}. " 

868 f"Must be one of {_VALID_SEARCH_TYPES}" 

869 ) 

870 self.search_type = search_type 

871 self.search_endpoint = f"{self.api_base}/search/{search_type}" 

872 logger.info(f"Set GitHub search type to: {search_type}") 

873 

874 def _filter_for_relevance( 

875 self, previews: List[Dict[str, Any]], query: str 

876 ) -> List[Dict[str, Any]]: 

877 """ 

878 Filter GitHub search results for relevance using LLM. 

879 

880 Args: 

881 previews: List of preview dictionaries 

882 query: Original search query 

883 

884 Returns: 

885 List of relevant preview dictionaries 

886 """ 

887 if not self.llm or not previews: 

888 return previews 

889 

890 # Create a specialized prompt for GitHub results 

891 prompt = f"""Analyze these GitHub search results and rank them by relevance to the query. 

892Consider: 

8931. Repository stars and activity (higher is better) 

8942. Match between query intent and repository description 

8953. Repository language and topics 

8964. Last update time (more recent is better) 

8975. Whether it's a fork (original repositories are preferred) 

898 

899Query: "{query}" 

900 

901Results: 

902{json.dumps(previews, indent=2)} 

903 

904Return ONLY a JSON array of indices in order of relevance (most relevant first). 

905Example: [0, 2, 1, 3] 

906Do not include any other text or explanation.""" 

907 

908 try: 

909 response = self.llm.invoke(prompt) 

910 response_text = get_llm_response_text(response) 

911 

912 ranked_indices = extract_json(response_text, expected_type=list) 

913 

914 if ranked_indices is not None: 

915 # Return the results in ranked order 

916 ranked_results = [] 

917 for idx in ranked_indices: 

918 if idx < len(previews): 

919 ranked_results.append(previews[idx]) 

920 

921 # Limit to max_filtered_results if specified 

922 if ( 

923 self.max_filtered_results 

924 and len(ranked_results) > self.max_filtered_results 

925 ): 

926 logger.info( 

927 f"Limiting filtered results to top {self.max_filtered_results}" 

928 ) 

929 return ranked_results[: self.max_filtered_results] 

930 

931 return ranked_results 

932 logger.info( 

933 "Could not find JSON array in response, returning no previews" 

934 ) 

935 return [] 

936 

937 except Exception: 

938 logger.exception("Error filtering GitHub results") 

939 return []