Coverage for src/local_deep_research/web_search_engines/engines/search_engine_github.py: 99%
330 statements
« prev ^ index » next coverage.py v7.14.1, created at 2026-06-03 23:15 +0000
1import base64
2import json
3import time
4from typing import Any, Dict, List, Optional
6from langchain_core.language_models import BaseLLM
7from loguru import logger
9from ...config import llm_config, search_config
10from ...constants import USER_AGENT
11from ...security.safe_requests import safe_get
12from ...utilities.json_utils import extract_json, get_llm_response_text
13from ..search_engine_base import BaseSearchEngine
16_VALID_SEARCH_TYPES = frozenset({"repositories", "code", "issues", "users"})
19class GitHubSearchEngine(BaseSearchEngine):
20 """
21 GitHub search engine implementation.
22 Provides search across GitHub repositories, code, issues, and users.
23 """
25 is_lexical = True
26 needs_llm_relevance_filter = True
def __init__(
    self,
    max_results: int = 15,
    api_key: Optional[str] = None,
    search_type: str = "repositories",
    include_readme: bool = True,
    include_issues: bool = False,
    llm: Optional[BaseLLM] = None,
    max_filtered_results: Optional[int] = None,
    settings_snapshot: Optional[Dict[str, Any]] = None,
):
    """
    Set up a GitHub search engine instance.

    Args:
        max_results: Maximum number of search results
        api_key: GitHub API token (can also be set via LDR_SEARCH_ENGINE_WEB_GITHUB_API_KEY env var or in UI settings)
        search_type: Type of GitHub search ("repositories", "code", "issues", "users")
        include_readme: Whether to include README content for repositories
        include_issues: Whether to include recent issues for repositories
        llm: Language model for relevance filtering
        max_filtered_results: Maximum number of results to keep after filtering
        settings_snapshot: Forwarded unchanged to the base engine initializer

    Raises:
        ValueError: If search_type is not one of the supported search types
    """
    # The base engine receives the LLM, both result limits and the settings.
    super().__init__(
        max_results=max_results,
        max_filtered_results=max_filtered_results,
        llm=llm,
        settings_snapshot=settings_snapshot,
    )
    self.api_key = api_key

    type_is_supported = search_type in _VALID_SEARCH_TYPES
    if not type_is_supported:
        raise ValueError(
            f"Invalid GitHub search_type: {search_type!r}. "
            f"Must be one of {_VALID_SEARCH_TYPES}"
        )

    self.search_type = search_type
    self.include_readme = include_readme
    self.include_issues = include_issues

    # Flipped to True only when this engine fetches its own LLM from config.
    self._owns_llm = False

    # API endpoints
    self.api_base = "https://api.github.com"
    self.search_endpoint = f"{self.api_base}/search/{search_type}"

    # Headers sent with every request; the token is added only when present.
    request_headers = {
        "Accept": "application/vnd.github.v3+json",
        "User-Agent": USER_AGENT,
    }
    self.headers = request_headers

    if not self.api_key:
        logger.warning(
            "No GitHub API key provided. Rate limits will be restricted."
        )
    else:
        request_headers["Authorization"] = f"token {self.api_key}"
        logger.info("Using authenticated GitHub API requests")
def close(self) -> None:
    """Release the LLM client if this engine fetched it itself, then close the base engine."""
    from ...utilities.resource_utils import safe_close

    engine_created_llm = self._owns_llm
    if engine_created_llm:
        safe_close(self.llm, "GitHub LLM")

    super().close()
97 def _handle_rate_limits(self, response):
98 """Handle GitHub API rate limits by logging warnings and sleeping if necessary"""
99 remaining = int(response.headers.get("X-RateLimit-Remaining", 60))
100 reset_time = int(response.headers.get("X-RateLimit-Reset", 0))
102 if remaining < 5:
103 current_time = time.time()
104 wait_time = max(reset_time - current_time, 0)
105 logger.warning(
106 f"GitHub API rate limit almost reached. {remaining} requests remaining."
107 )
109 if wait_time > 0 and remaining == 0:
110 logger.warning(
111 f"GitHub API rate limit exceeded. Waiting {wait_time:.0f} seconds."
112 )
113 time.sleep(min(wait_time, 60)) # Wait at most 60 seconds
def _optimize_github_query(self, query: str) -> str:
    """
    Rewrite a query into GitHub search syntax using the LLM.

    The original query is returned unchanged when no LLM is available,
    when the LLM call fails, or when the LLM answers with an empty string.

    Args:
        query: Original search query

    Returns:
        Optimized GitHub search query, or the original query on any failure
    """
    # Lazily fetch an LLM from config if none was supplied
    if not self.llm:
        try:
            self.llm = llm_config.get_llm()
        except Exception:
            logger.exception("Error getting LLM from config")
            return query
        if not self.llm:
            logger.warning("No LLM available for query optimization")
            return query
        # Claim ownership only once an LLM was actually obtained here,
        # so close() never tries to close a client that does not exist.
        self._owns_llm = True

    prompt = f"""Transform this GitHub search query into an optimized version for the GitHub search API. Follow these steps:
    1. Strip question words (e.g., 'what', 'are', 'is'), stop words (e.g., 'and', 'as', 'of', 'on'), and redundant terms (e.g., 'repositories', 'repos', 'github') since they're implied by the search context.
    2. Keep only domain-specific keywords and avoid using "-related" terms.
    3. Add GitHub-specific filters with dynamic thresholds based on query context:
       - For stars: Use higher threshold (e.g., 'stars:>1000') for mainstream topics, lower (e.g., 'stars:>50') for specialized topics
       - For language: Detect programming language from query or omit if unclear
       - For search scope: Use 'in:name,description,readme' for general queries, 'in:file' for code-specific queries
    4. For date ranges, adapt based on query context:
       - For emerging: Use 'created:>2024-01-01'
       - For mature: Use 'pushed:>2023-01-01'
       - For historical research: Use 'created:2020-01-01..2024-01-01'
    5. For excluding results, adapt based on query:
       - Exclude irrelevant languages based on context
       - Use 'NOT' to exclude competing terms
    6. Ensure the output is a concise, space-separated string with no punctuation or extra text beyond keywords and filters.


    Original query: "{query}"

    Return ONLY the optimized query, ready for GitHub's search API. Do not include explanations or additional text."""

    try:
        response = self.llm.invoke(prompt)

        # Use the same text-extraction helper that _filter_for_relevance
        # applies to LLM responses, so both paths read responses the same way.
        optimized_query = str(get_llm_response_text(response) or "").strip()

        if not optimized_query:
            logger.warning("LLM returned empty query, using original")
            return query

        logger.info(
            f"LLM optimized query from '{query}' to '{optimized_query}'"
        )
        return optimized_query

    except Exception:
        logger.exception("Error optimizing query with LLM")
        return query
def _search_github(self, query: str) -> List[Dict[str, Any]]:
    """
    Run one GitHub search request for the currently configured search type.

    The query is first passed through the LLM optimizer. Any exception is
    logged and swallowed, and whatever items were collected so far (possibly
    none) are returned.

    Args:
        query: The search query

    Returns:
        List of GitHub search result items
    """
    # Sort field per search type; all of them are requested in descending order.
    sort_field_by_type = {
        "repositories": "stars",
        "code": "indexed",
        "issues": "updated",
        "users": "followers",
    }
    results = []

    try:
        github_query = self._optimize_github_query(query)
        logger.info(f"Final GitHub query: {github_query}")

        # GitHub API max is 100 per page
        page_size = min(self.max_results, 100)
        params = {"q": github_query, "per_page": page_size, "page": 1}

        sort_field = sort_field_by_type.get(self.search_type)
        if sort_field is not None:
            params["sort"] = sort_field
            params["order"] = "desc"

        # Apply rate limiting before request
        self._last_wait_time = self.rate_tracker.apply_rate_limit(
            self.engine_type
        )

        response = safe_get(
            self.search_endpoint, headers=self.headers, params=params
        )

        # Check for rate limiting
        self._handle_rate_limits(response)

        if response.status_code != 200:
            logger.error(
                f"GitHub API error: {response.status_code} - {response.text}"
            )
            return results

        data = response.json()
        total_count = data.get("total_count", 0)
        results = data.get("items", [])
        logger.info(
            f"GitHub search returned {len(results)} results (total available: {total_count})"
        )

        rate_limit_remaining = response.headers.get(
            "X-RateLimit-Remaining", "unknown"
        )
        logger.info(
            f"GitHub API rate limit: {rate_limit_remaining} requests remaining"
        )

        # An empty result set gets a few search tips in the log
        if not results:
            for tip in (
                "No results found. Consider these search tips:",
                "1. Use shorter, more specific queries",
                "2. For repositories, try adding 'stars:>100' or 'language:python'",
                "3. For contribution opportunities, search for 'good-first-issue' or 'help-wanted'",
            ):
                logger.warning(tip)

    except Exception:
        logger.exception("Error searching GitHub")

    return results
def _get_readme_content(self, repo_full_name: str) -> str:
    """
    Fetch the README of a repository through the GitHub API.

    Args:
        repo_full_name: Full name of the repository (owner/repo)

    Returns:
        README text (base64 payloads are decoded as UTF-8 with replacement
        characters), or an empty string on a non-200 response or any error
    """
    try:
        # Apply rate limiting before request
        self._last_wait_time = self.rate_tracker.apply_rate_limit(
            self.engine_type
        )

        readme_url = f"{self.api_base}/repos/{repo_full_name}/readme"
        response = safe_get(readme_url, headers=self.headers)

        # Check for rate limiting
        self._handle_rate_limits(response)

        if response.status_code != 200:
            logger.warning(
                f"Could not get README for {repo_full_name}: {response.status_code}"
            )
            return ""

        payload = response.json()
        raw_content: str = payload.get("content", "")
        is_base64 = payload.get("encoding", "") == "base64"

        if not (is_base64 and raw_content):
            return raw_content

        decoded_bytes = base64.b64decode(raw_content)
        return decoded_bytes.decode("utf-8", errors="replace")

    except Exception:
        logger.exception(f"Error getting README for {repo_full_name}")
        return ""
def _get_recent_issues(
    self, repo_full_name: str, limit: int = 5
) -> List[Dict[str, Any]]:
    """
    Fetch the most recently updated issues of a repository.

    Args:
        repo_full_name: Full name of the repository (owner/repo)
        limit: Maximum number of issues to return

    Returns:
        Parsed JSON body on a 200 response; otherwise an empty list
    """
    recent_issues = []

    try:
        # Apply rate limiting before request
        self._last_wait_time = self.rate_tracker.apply_rate_limit(
            self.engine_type
        )

        # Issues in any state, most recently updated first
        query_params = {
            "state": "all",
            "per_page": limit,
            "sort": "updated",
            "direction": "desc",
        }
        response = safe_get(
            f"{self.api_base}/repos/{repo_full_name}/issues",
            headers=self.headers,
            params=query_params,
        )

        # Check for rate limiting
        self._handle_rate_limits(response)

        if response.status_code != 200:
            logger.warning(
                f"Could not get issues for {repo_full_name}: {response.status_code}"
            )
        else:
            recent_issues = response.json()
            logger.info(
                f"Got {len(recent_issues)} recent issues for {repo_full_name}"
            )

    except Exception:
        logger.exception(f"Error getting issues for {repo_full_name}")

    return recent_issues
def _get_file_content(self, file_url: str) -> str:
    """
    Download a file through its GitHub API URL.

    Args:
        file_url: API URL for the file

    Returns:
        File text (base64 payloads are decoded as UTF-8 with replacement
        characters), or an empty string on a non-200 response or any error
    """
    try:
        # Apply rate limiting before request
        self._last_wait_time = self.rate_tracker.apply_rate_limit(
            self.engine_type
        )

        response = safe_get(file_url, headers=self.headers)

        # Check for rate limiting
        self._handle_rate_limits(response)

        if response.status_code != 200:
            logger.warning(
                f"Could not get file content: {response.status_code}"
            )
            return ""

        payload = response.json()
        raw_content: str = payload.get("content", "")
        is_base64 = payload.get("encoding", "") == "base64"

        if not (is_base64 and raw_content):
            return raw_content

        decoded_bytes = base64.b64decode(raw_content)
        return decoded_bytes.decode("utf-8", errors="replace")

    except Exception:
        logger.exception("Error getting file content")
        return ""
410 def _format_repository_preview(
411 self, repo: Dict[str, Any]
412 ) -> Dict[str, Any]:
413 """Format repository search result as preview"""
414 return {
415 "id": str(repo.get("id", "")),
416 "title": repo.get("full_name", ""),
417 "link": repo.get("html_url", ""),
418 "snippet": repo.get("description", "No description provided"),
419 "stars": repo.get("stargazers_count", 0),
420 "forks": repo.get("forks_count", 0),
421 "language": repo.get("language", ""),
422 "updated_at": repo.get("updated_at", ""),
423 "created_at": repo.get("created_at", ""),
424 "topics": repo.get("topics", []),
425 "owner": repo.get("owner", {}).get("login", ""),
426 "is_fork": repo.get("fork", False),
427 "search_type": "repository",
428 "repo_full_name": repo.get("full_name", ""),
429 }
431 def _format_code_preview(self, code: Dict[str, Any]) -> Dict[str, Any]:
432 """Format code search result as preview"""
433 repo = code.get("repository", {})
434 return {
435 "id": f"code_{code.get('sha', '')}",
436 "title": f"{code.get('name', '')} in {repo.get('full_name', '')}",
437 "link": code.get("html_url", ""),
438 "snippet": f"Match in {code.get('path', '')}",
439 "path": code.get("path", ""),
440 "repo_name": repo.get("full_name", ""),
441 "repo_url": repo.get("html_url", ""),
442 "search_type": "code",
443 "file_url": code.get("url", ""),
444 }
446 def _format_issue_preview(self, issue: Dict[str, Any]) -> Dict[str, Any]:
447 """Format issue search result as preview"""
448 repo = (
449 issue.get("repository", {})
450 if "repository" in issue
451 else {"full_name": ""}
452 )
453 return {
454 "id": f"issue_{issue.get('number', '')}",
455 "title": issue.get("title", ""),
456 "link": issue.get("html_url", ""),
457 "snippet": (
458 issue.get("body", "")[:200] + "..."
459 if len(issue.get("body", "")) > 200
460 else issue.get("body", "")
461 ),
462 "state": issue.get("state", ""),
463 "created_at": issue.get("created_at", ""),
464 "updated_at": issue.get("updated_at", ""),
465 "user": issue.get("user", {}).get("login", ""),
466 "comments": issue.get("comments", 0),
467 "search_type": "issue",
468 "repo_name": repo.get("full_name", ""),
469 }
471 def _format_user_preview(self, user: Dict[str, Any]) -> Dict[str, Any]:
472 """Format user search result as preview"""
473 return {
474 "id": f"user_{user.get('id', '')}",
475 "title": user.get("login", ""),
476 "link": user.get("html_url", ""),
477 "snippet": user.get("bio", "No bio provided"),
478 "name": user.get("name", ""),
479 "followers": user.get("followers", 0),
480 "public_repos": user.get("public_repos", 0),
481 "location": user.get("location", ""),
482 "search_type": "user",
483 "user_login": user.get("login", ""),
484 }
def _get_previews(self, query: str) -> List[Dict[str, Any]]:
    """
    Get preview information for GitHub search results.

    Queries that mention contributing or beginners are redirected to a
    repository search for projects with good first issues, whatever search
    type the engine is configured for. The configured search type and
    endpoint are restored afterwards, and the results are formatted
    according to the search that was actually executed.

    Args:
        query: The search query

    Returns:
        List of preview dictionaries
    """
    logger.info(f"Getting GitHub previews for query: {query}")

    lowered_query = query.lower()
    contribution_terms = (
        "contribute",
        "contributing",
        "contribution",
        "beginner",
        "newcomer",
    )

    # Search type whose result schema the formatter below must match.
    result_type = self.search_type

    if any(term in lowered_query for term in contribution_terms):
        # Create a specialized query for finding beginner-friendly projects
        specialized_query = "good-first-issues:>5 is:public archived:false"

        # Detect languages on whole words, so "javascript" does not also
        # select "java" and "good" does not select "go".
        words = {
            word.strip(".;:!?()\"'")
            for word in lowered_query.replace(",", " ")
            .replace("/", " ")
            .split()
        }
        if "golang" in words:
            words.add("go")
        known_languages = (
            "python",
            "javascript",
            "java",
            "rust",
            "go",
            "typescript",
            "c#",
            "c++",
            "ruby",
        )
        languages = [lang for lang in known_languages if lang in words]
        if languages:
            specialized_query += f" language:{' language:'.join(languages)}"

        # Keep longer words that are not generic filler as keywords
        filler_words = (
            "recommend",
            "recommended",
            "github",
            "repositories",
            "looking",
            "developers",
            "contribute",
            "contributing",
            "beginner",
            "newcomer",
        )
        keywords = [
            word
            for word in query.split()
            if len(word) > 3 and word.lower() not in filler_words
        ]
        if keywords:
            # Add up to 5 keywords
            specialized_query += " " + " ".join(keywords[:5])

        logger.info(
            f"Using specialized contribution query: {specialized_query}"
        )

        # Temporarily switch to a repository search; always restore the
        # configured type and endpoint, even if the search raises.
        original_search_type = self.search_type
        result_type = "repositories"
        self.search_type = "repositories"
        self.search_endpoint = f"{self.api_base}/search/repositories"
        try:
            results = self._search_github(specialized_query)
        finally:
            self.search_type = original_search_type
            self.search_endpoint = (
                f"{self.api_base}/search/{self.search_type}"
            )
    else:
        # Perform standard GitHub search
        results = self._search_github(query)

    if not results:
        logger.warning(f"No GitHub results found for query: {query}")
        return []

    # Pick the formatter for the search that produced the results
    formatters = {
        "repositories": self._format_repository_preview,
        "code": self._format_code_preview,
        "issues": self._format_issue_preview,
        "users": self._format_user_preview,
    }
    formatter = formatters.get(result_type)
    if formatter is None:
        logger.warning(f"Unknown search type: {result_type}")
        previews = []
    else:
        previews = [formatter(result) for result in results]

    logger.info(f"Formatted {len(previews)} GitHub preview results")
    return previews
def _get_full_content(
    self, relevant_items: List[Dict[str, Any]]
) -> List[Dict[str, Any]]:
    """
    Get full content for the relevant GitHub search results.

    In snippet-only mode the items are returned untouched. Otherwise each
    item is copied and enriched according to its ``search_type``: README
    and/or recent issues for repositories, file content for code hits, the
    snippet for issues, and a generated profile summary for users.

    Args:
        relevant_items: List of relevant preview dictionaries

    Returns:
        List of result dictionaries with full content
    """
    if getattr(search_config, "SEARCH_SNIPPETS_ONLY", False):
        logger.info("Snippet-only mode, skipping full content retrieval")
        return relevant_items

    logger.info(
        f"Getting full content for {len(relevant_items)} GitHub results"
    )

    results = []
    for item in relevant_items:
        result = item.copy()
        search_type = item.get("search_type", "")

        if search_type == "repository":
            repo_full_name = item.get("repo_full_name", "")
            if repo_full_name:
                # README and issues are independent options, as in
                # search_repository(): issues are fetched even when the
                # README is not requested.
                if self.include_readme:
                    result["full_content"] = self._get_readme_content(
                        repo_full_name
                    )
                    result["content_type"] = "readme"
                if self.include_issues:
                    result["recent_issues"] = self._get_recent_issues(
                        repo_full_name
                    )

        elif search_type == "code":
            file_url = item.get("file_url", "")
            if file_url:
                result["full_content"] = self._get_file_content(file_url)
                result["content_type"] = "file"

        elif search_type == "issue":
            # The snippet already holds the (possibly truncated) issue body
            result["full_content"] = item.get("snippet", "")
            result["content_type"] = "issue"

        elif search_type == "user":
            # For users, construct a profile summary from the preview fields
            summary_lines = [f"GitHub user: {item.get('title', '')}\n"]
            if item.get("name"):
                summary_lines.append(f"Name: {item.get('name')}\n")
            if item.get("location"):
                summary_lines.append(f"Location: {item.get('location')}\n")
            summary_lines.append(f"Followers: {item.get('followers', 0)}\n")
            summary_lines.append(
                f"Public repositories: {item.get('public_repos', 0)}\n"
            )
            bio = item.get("snippet")
            if bio and bio != "No bio provided":
                summary_lines.append(f"\nBio: {bio}\n")

            result["full_content"] = "".join(summary_lines)
            result["content_type"] = "user_profile"

        results.append(result)

    return results
def search_repository(
    self, repo_owner: str, repo_name: str
) -> Dict[str, Any]:
    """
    Look up one repository and return it as an enriched preview.

    Args:
        repo_owner: Owner of the repository
        repo_name: Name of the repository

    Returns:
        Repository preview, plus README and/or recent issues when those
        options are enabled; an empty dict on a non-200 response or any error
    """
    repo_full_name = f"{repo_owner}/{repo_name}"
    logger.info(f"Getting details for repository: {repo_full_name}")

    try:
        # Apply rate limiting before request
        self._last_wait_time = self.rate_tracker.apply_rate_limit(
            self.engine_type
        )

        repo_url = f"{self.api_base}/repos/{repo_full_name}"
        response = safe_get(repo_url, headers=self.headers)

        # Check for rate limiting
        self._handle_rate_limits(response)

        if response.status_code != 200:
            logger.error(
                f"Error getting repository details: {response.status_code} - {response.text}"
            )
            return {}

        details = self._format_repository_preview(response.json())

        if self.include_readme:
            details["full_content"] = self._get_readme_content(
                repo_full_name
            )
            details["content_type"] = "readme"

        if self.include_issues:
            details["recent_issues"] = self._get_recent_issues(
                repo_full_name
            )

        return details

    except Exception:
        logger.exception("Error getting repository details")
        return {}
def search_code(
    self,
    query: str,
    language: Optional[str] = None,
    user: Optional[str] = None,
) -> List[Dict[str, Any]]:
    """
    Run a code search with optional language and owner qualifiers.

    The engine is switched to the code search endpoint for the duration of
    the call, and its previous search type is restored afterwards.

    Args:
        query: Code search query
        language: Filter by programming language
        user: Filter by GitHub username/organization

    Returns:
        List of code search results
    """
    # Append only the qualifiers that were actually supplied
    qualifiers = []
    if language:
        qualifiers.append(f" language:{language}")
    if user:
        qualifiers.append(f" user:{user}")
    advanced_query = query + "".join(qualifiers)

    previous_search_type = self.search_type

    try:
        self.search_type = "code"
        self.search_endpoint = f"{self.api_base}/search/code"

        hits = self._search_github(advanced_query)
        previews = [self._format_code_preview(hit) for hit in hits]

        # Full content is fetched only when the setting exists and is off
        setting_present = hasattr(search_config, "SEARCH_SNIPPETS_ONLY")
        if not setting_present or search_config.SEARCH_SNIPPETS_ONLY:
            return previews

        return self._get_full_content(previews)

    finally:
        # Restore original search type
        self.search_type = previous_search_type
        self.search_endpoint = f"{self.api_base}/search/{self.search_type}"
def search_issues(
    self, query: str, state: str = "open", sort: str = "updated"
) -> List[Dict[str, Any]]:
    """
    Search for issues with more specific parameters.

    The engine is switched to the issues search endpoint for the duration of
    the call, and its previous search type is restored afterwards.

    Args:
        query: Issue search query
        state: Filter by issue state ("open", "closed", "all"). For "all"
            no ``state:`` qualifier is added, so the search is not
            restricted by state.
        sort: Sort order ("updated", "created", "comments")

    Returns:
        List of issue previews; an empty list on a non-200 response
    """
    # "all" means no state restriction; the search syntax expresses that by
    # leaving the qualifier out rather than by sending "state:all".
    if state and state != "all":
        advanced_query = f"{query} state:{state}"
    else:
        advanced_query = query

    # Save current search type
    original_search_type = self.search_type

    try:
        self.search_type = "issues"
        self.search_endpoint = f"{self.api_base}/search/issues"

        params = {
            "q": advanced_query,
            "per_page": min(self.max_results, 100),
            "page": 1,
            "sort": sort,
            "order": "desc",
        }

        # Apply rate limiting before request, like every other request
        # issued by this engine.
        self._last_wait_time = self.rate_tracker.apply_rate_limit(
            self.engine_type
        )

        response = safe_get(
            self.search_endpoint, headers=self.headers, params=params
        )

        # Check for rate limiting
        self._handle_rate_limits(response)

        if response.status_code != 200:
            logger.error(
                f"GitHub API error: {response.status_code} - {response.text}"
            )
            return []

        # For issues, the previews are returned as-is (no full content)
        items = response.json().get("items", [])
        return [self._format_issue_preview(item) for item in items]

    finally:
        # Restore original search type
        self.search_type = original_search_type
        self.search_endpoint = f"{self.api_base}/search/{self.search_type}"
def set_search_type(self, search_type: str):
    """
    Switch the engine to another GitHub search type for later searches.

    Args:
        search_type: Type of GitHub search ("repositories", "code", "issues", "users")

    Raises:
        ValueError: If search_type is not one of the supported search types
    """
    if search_type in _VALID_SEARCH_TYPES:
        self.search_type = search_type
        self.search_endpoint = f"{self.api_base}/search/{search_type}"
        logger.info(f"Set GitHub search type to: {search_type}")
        return

    raise ValueError(
        f"Invalid GitHub search_type: {search_type!r}. "
        f"Must be one of {_VALID_SEARCH_TYPES}"
    )
def _filter_for_relevance(
    self, previews: List[Dict[str, Any]], query: str
) -> List[Dict[str, Any]]:
    """
    Filter GitHub search results for relevance using LLM.

    The LLM is asked for a JSON array of preview indices ordered by
    relevance. Only integer indices inside the valid range are used, and
    each index is used at most once, so a single malformed entry does not
    discard the whole ranking.

    Args:
        previews: List of preview dictionaries
        query: Original search query

    Returns:
        Previews in ranked order, truncated to max_filtered_results when
        set. The input list is returned unchanged when there is no LLM or
        no previews; an empty list is returned when the LLM answer cannot
        be parsed or an error occurs.
    """
    if not self.llm or not previews:
        return previews

    # Create a specialized prompt for GitHub results
    prompt = f"""Analyze these GitHub search results and rank them by relevance to the query.
Consider:
1. Repository stars and activity (higher is better)
2. Match between query intent and repository description
3. Repository language and topics
4. Last update time (more recent is better)
5. Whether it's a fork (original repositories are preferred)

Query: "{query}"

Results:
{json.dumps(previews, indent=2)}

Return ONLY a JSON array of indices in order of relevance (most relevant first).
Example: [0, 2, 1, 3]
Do not include any other text or explanation."""

    try:
        response = self.llm.invoke(prompt)
        response_text = get_llm_response_text(response)

        ranked_indices = extract_json(response_text, expected_type=list)

        if ranked_indices is None:
            logger.info(
                "Could not find JSON array in response, returning no previews"
            )
            return []

        ranked_results = []
        used_indices = set()
        for idx in ranked_indices:
            # bool is a subclass of int; True/False are not valid indices
            if isinstance(idx, bool) or not isinstance(idx, int):
                continue
            # Negative values would silently index from the end of the list
            if not 0 <= idx < len(previews) or idx in used_indices:
                continue
            used_indices.add(idx)
            ranked_results.append(previews[idx])

        # Limit to max_filtered_results if specified
        if (
            self.max_filtered_results
            and len(ranked_results) > self.max_filtered_results
        ):
            logger.info(
                f"Limiting filtered results to top {self.max_filtered_results}"
            )
            return ranked_results[: self.max_filtered_results]

        return ranked_results

    except Exception:
        logger.exception("Error filtering GitHub results")
        return []