Coverage for src / local_deep_research / web_search_engines / engines / search_engine_mojeek.py: 96%

86 statements  

« prev     ^ index     » next       coverage.py v7.13.5, created at 2026-04-14 23:55 +0000

1from typing import Any, Dict, List, Optional 

2 

3from langchain_core.language_models import BaseLLM 

4from loguru import logger 

5 

6from ...config import search_config 

7from ...security.safe_requests import safe_get 

8from ..rate_limiting import RateLimitError 

9from ..search_engine_base import BaseSearchEngine 

10 

11 

class MojeekSearchEngine(BaseSearchEngine):
    """
    Mojeek search engine implementation.

    Mojeek is a privacy-focused search engine with its own independent
    web crawler and index. Requires a paid API key from mojeek.com.
    """

    # Mark as public search engine
    is_public = True
    # Mark as generic search engine (general web search)
    is_generic = True
    is_lexical = True
    needs_llm_relevance_filter = True

    def _is_valid_search_result(self, url: str) -> bool:
        """
        Check if a URL is a valid absolute HTTP(S) URL.

        Returns False for relative URLs, empty strings, non-HTTP schemes,
        and any non-string value (for example ``None`` or a number decoded
        from JSON), so a malformed entry is rejected instead of raising.
        """
        # Guard first: calling .lower() on a non-string would raise and, via
        # the broad handler in _get_search_results, discard every result.
        if not isinstance(url, str):
            return False
        return url.lower().startswith(("http://", "https://"))

    def __init__(
        self,
        max_results: int = 10,
        language: str = "en",
        region: str = "",
        safe_search: bool = False,
        api_key: Optional[str] = None,
        llm: Optional[BaseLLM] = None,
        max_filtered_results: Optional[int] = None,
        settings_snapshot: Optional[Dict[str, Any]] = None,
        include_full_content: bool = True,
        **kwargs,
    ):
        """
        Initialize the Mojeek search engine.

        Args:
            max_results: Maximum number of search results
            language: Language code in ISO 639-1 format (e.g. 'en', 'fr')
            region: Country code in ISO 3166-1 alpha-2 format (e.g. 'GB', 'FR')
            safe_search: Whether to enable safe search filtering
            api_key: Mojeek API key
            llm: Language model for relevance filtering
            max_filtered_results: Maximum number of results to keep after filtering
            settings_snapshot: Settings snapshot for thread context
            include_full_content: Whether to include full webpage content
            **kwargs: Extra keyword arguments forwarded to the base class
        """
        super().__init__(
            llm=llm,
            max_filtered_results=max_filtered_results,
            max_results=max_results,
            include_full_content=include_full_content,
            settings_snapshot=settings_snapshot,
            **kwargs,
        )

        # Get API key - check params, settings, or env vars
        mojeek_api_key = self._resolve_api_key(
            api_key,
            "search.engine.web.mojeek.api_key",
            engine_name="Mojeek",
            settings_snapshot=settings_snapshot,
        )

        self.search_url = "https://api.mojeek.com/search"
        self.max_results = max_results
        self.language = language
        self.region = region
        self.safe_search = safe_search
        self.api_key = mojeek_api_key

        # If full content is requested, initialize FullSearchResults
        self._init_full_search(
            web_search=self,
            language=language,
            max_results=max_results,
            region=region,
            safe_search=safe_search,
            time_period="y",
        )

    def _get_search_results(self, query: str) -> List[Dict[str, Any]]:
        """
        Get search results from the Mojeek API.

        Args:
            query: The search query

        Returns:
            List of search result dicts with the keys ``title``, ``url``,
            ``content``, ``engine`` and ``category``. An empty list is
            returned for a non-200 HTTP status, an API status other than
            "OK", or any unexpected error. Individual malformed entries
            (non-dict items, missing or non-HTTP URLs) are skipped.

        Raises:
            RateLimitError: If the API answers with HTTP 403.
        """
        logger.info(f"Mojeek running search for query: {query}")

        try:
            params = {
                "q": query,
                "api_key": self.api_key,
                "fmt": "json",
                "t": self.max_results,
                "safe": 1 if self.safe_search else 0,
            }

            # Language and region are optional boosts; only send them when set.
            if self.language:
                params["lb"] = self.language
                params["lbb"] = 100

            if self.region:
                params["rb"] = self.region
                params["rbb"] = 10

            logger.info(f"Sending request to Mojeek API at {self.search_url}")

            response = safe_get(
                self.search_url,
                params=params,
                timeout=15,
            )

            if response.status_code == 403:
                raise RateLimitError(  # noqa: TRY301 — re-raised by except RateLimitError for base class retry
                    "Mojeek API rate limit hit (403 Forbidden)"
                )

            if response.status_code != 200:
                logger.warning(
                    f"Mojeek API returned status {response.status_code}"
                )
                return []

            data = response.json()

            # Tolerate an unexpected payload shape: anything that is not a
            # mapping is treated as "no response" and reported as a missing
            # status below rather than raising AttributeError.
            response_data = data.get("response") if isinstance(data, dict) else None
            if not isinstance(response_data, dict):
                response_data = {}

            if response_data.get("status") != "OK":
                logger.warning(
                    f"Mojeek API response status: "
                    f"{response_data.get('status', 'missing')}"
                )
                return []

            # "results" may be absent or null; normalise to a list.
            raw_results = response_data.get("results")
            if not isinstance(raw_results, list):
                raw_results = []

            results = []
            for result in raw_results:
                # Skip a single malformed entry instead of losing the
                # whole batch to the broad exception handler.
                if not isinstance(result, dict):
                    continue
                url = result.get("url", "")
                if not self._is_valid_search_result(url):
                    continue
                results.append(
                    {
                        "title": result.get("title", ""),
                        "url": url,
                        "content": result.get("desc", ""),
                        "engine": "mojeek",
                        "category": result.get("cats", ""),
                    }
                )

            if results:
                logger.info(f"Mojeek returned {len(results)} valid results")
            else:
                logger.warning(
                    f"Mojeek returned no valid results for query: {query}"
                )

            return results

        except RateLimitError:
            # Propagate untouched so the caller can apply its retry logic.
            raise
        except Exception:
            # NOTE(review): the logged exception text may include the request
            # URL with its query string (and therefore api_key) — confirm
            # that safe_get or the log sink redacts it.
            logger.exception("Error when searching using Mojeek")
            return []

    def _get_previews(self, query: str) -> List[Dict[str, Any]]:
        """
        Get preview information for Mojeek search results.

        Args:
            query: The search query

        Returns:
            List of preview dictionaries with the keys ``id``, ``title``,
            ``link``, ``snippet``, ``engine`` and ``category``; empty when
            the search produced no results.
        """
        logger.info(f"Getting Mojeek previews for query: {query}")

        results = self._get_search_results(query)

        if not results:
            logger.warning(f"No Mojeek results found for query: {query}")
            return []

        return [
            {
                # The URL doubles as the id; fall back to a positional id
                # when it is empty.
                "id": result.get("url", "") or f"mojeek-result-{i}",
                "title": result.get("title", ""),
                "link": result.get("url", ""),
                "snippet": result.get("content", ""),
                "engine": result.get("engine", "mojeek"),
                "category": result.get("category", ""),
            }
            for i, result in enumerate(results)
        ]

    def _get_full_content(
        self, relevant_items: List[Dict[str, Any]]
    ) -> List[Dict[str, Any]]:
        """
        Get full content for the relevant search results.

        Args:
            relevant_items: List of relevant preview dictionaries

        Returns:
            List of result dictionaries with full content. The input list
            is returned unchanged in snippet-only mode, when full content
            retrieval is not enabled or available, or when retrieval fails.
        """
        if getattr(search_config, "SEARCH_SNIPPETS_ONLY", False):
            logger.info("Snippet-only mode, skipping full content retrieval")
            return relevant_items

        if self.include_full_content and hasattr(self, "full_search"):
            logger.info("Retrieving full webpage content")
            try:
                return self.full_search._get_full_content(relevant_items)
            except Exception:
                # Best effort: log and fall through to the preview items.
                logger.exception("Error retrieving full content")

        return relevant_items

243 logger.exception("Error retrieving full content") 

244 

245 return relevant_items