Coverage for src / local_deep_research / news / utils / topic_generator.py: 97%

61 statements  

« prev     ^ index     » next       coverage.py v7.12.0, created at 2026-01-11 00:51 +0000

1""" 

2Topic generation utilities for news items. 

3Uses LLM to extract relevant topics/tags from news content. 

4""" 

5 

6from loguru import logger 

7from typing import List 

8import json 

9 

10 

def generate_topics(
    query: str, findings: str = "", category: str = "", max_topics: int = 5
) -> List[str]:
    """
    Produce topic tags for a news item.

    The LLM is asked for topics; there is deliberately no heuristic
    fallback. When the LLM yields nothing, a visible placeholder entry is
    used instead so the failure is not hidden. The result is always passed
    through ``_validate_topics`` before being returned.

    Args:
        query: The search query or research question
        findings: The research findings/content
        category: The news category (if available)
        max_topics: Maximum number of topics to generate

    Returns:
        List of topic strings
    """
    llm_topics = _generate_with_llm(query, findings, category, max_topics)

    # An empty result means generation failed; surface that explicitly
    # rather than inventing topics.
    candidates = llm_topics or ["[Topic generation failed]"]

    return _validate_topics(candidates, max_topics)

35 

36 

def _generate_with_llm(
    query: str, findings: str, category: str, max_topics: int
) -> List[str]:
    """Ask the configured LLM for topics and parse its reply.

    This is best-effort: any failure (import error, LLM error, unusable
    reply) is logged at debug level and reported as an empty list so the
    caller can decide how to surface it.

    Args:
        query: The search query or research question (first 500 chars used).
        findings: The research findings/content (first 1000 chars used).
        category: The news category, included in the prompt when non-empty.
        max_topics: Number of topics requested and upper bound on the result.

    Returns:
        Up to ``max_topics`` stripped topic strings of at most 30 characters,
        or an empty list when nothing usable was obtained.
    """
    try:
        # Imported inside the try block so that an import failure is handled
        # like any other generation failure.
        from ...config.llm_config import get_llm

        logger.debug(
            f"Topic generation - findings length: {len(findings) if findings else 0}, category: {category}"
        )

        # Use the configured model for topic generation
        llm = get_llm(temperature=0.5)

        # Keep the prompt small: only a preview of each input is sent.
        query_preview = query[:500]
        findings_preview = findings[:1000] if findings else findings

        # Optional prompt lines; an empty string leaves a blank line behind.
        content_line = f"Content: {findings_preview}" if findings_preview else ""
        category_line = f"Category: {category}" if category else ""

        prompt = f"""Extract relevant topics/tags from this news content.

Query: {query_preview}
{content_line}
{category_line}

Generate {max_topics} specific, relevant topics that would help categorize and filter this news item.

Requirements:
- Each topic should be 1-3 words
- Topics should be specific and meaningful
- Include geographic regions if mentioned
- Include key entities (countries, organizations, people)
- Include event types (conflict, economy, disaster, etc.)
- Topics should be diverse and cover different aspects

Return ONLY a JSON array of topic strings, like: ["Topic 1", "Topic 2", "Topic 3"]"""

        response = llm.invoke(prompt)
        content = response.content.strip()

        topics = _parse_topics_response(content, max_topics)
        if topics:
            logger.debug(f"Generated topics: {topics}")
        else:
            logger.debug(f"No usable topics in LLM response: {content}")
        return topics

    except Exception as e:
        logger.debug(f"LLM topic generation failed: {e}")

    return []


def _parse_topics_response(
    content: str, max_topics: int, max_length: int = 30
) -> List[str]:
    """Turn raw LLM output into at most ``max_topics`` clean topic strings.

    Accepts a JSON array (optionally inside a Markdown code fence or
    surrounded by prose) and, failing that, a comma-separated list. Valid
    JSON that is not an array yields an empty list. Non-string array
    elements, blank entries and entries longer than ``max_length`` are
    dropped.
    """
    text = _strip_code_fence(content)

    try:
        parsed = json.loads(text)
    except json.JSONDecodeError:
        candidates = _topics_from_loose_text(text)
    else:
        if not isinstance(parsed, list):
            return []
        candidates = [item.strip() for item in parsed if isinstance(item, str)]

    return [t for t in candidates if t and len(t) <= max_length][:max_topics]


def _strip_code_fence(text: str) -> str:
    """Remove a surrounding Markdown code fence, with or without a json tag."""
    text = text.strip()
    if text.startswith("```"):
        text = text[3:]
        # Optional language tag directly after the opening fence.
        if text[:4].lower() == "json":
            text = text[4:]
    if text.endswith("```"):
        text = text[:-3]
    return text.strip()


def _topics_from_loose_text(text: str) -> List[str]:
    """Recover topic candidates from a reply that is not valid JSON as a whole."""
    # Models often wrap the array in prose; try the outermost [...] span.
    start, end = text.find("["), text.rfind("]")
    if 0 <= start < end:
        try:
            embedded = json.loads(text[start : end + 1])
        except json.JSONDecodeError:
            embedded = None
        if isinstance(embedded, list):
            return [item.strip() for item in embedded if isinstance(item, str)]

    # Last resort: treat the reply as a comma-separated list.
    if "," not in text:
        return []
    # Trim whitespace plus leftover quotes, brackets and backticks so that
    # near-JSON such as '["A", "B",]' does not leak punctuation into topics.
    edge_chars = " \t\r\n[]\"'`"
    return [part.strip(edge_chars) for part in text.split(",")]

112 

113 

def _validate_topics(topics: List[str], max_topics: int) -> List[str]:
    """Normalise, de-duplicate and cap a list of candidate topics.

    Each non-empty entry is stripped of surrounding whitespace and kept only
    if the result is between 2 and 30 characters long. Kept entries are
    lower-cased, repeats (compared case-insensitively) are dropped, and
    collection stops as soon as ``max_topics`` entries have been accepted.

    Args:
        topics: Candidate topic strings.
        max_topics: Stop collecting once this many topics were accepted.

    Returns:
        The accepted topics in lower case and first-seen order, or
        ``["[No valid topics]"]`` when no candidate survives. No default
        topics are invented, so the placeholder shows what actually happened.
    """
    # A dict preserves insertion order and rejects repeats in one structure.
    accepted = {}

    for candidate in topics:
        if not candidate:
            continue

        text = candidate.strip()
        if not 2 <= len(text) <= 30:
            continue

        key = text.lower()
        if key in accepted:
            continue
        accepted[key] = None

        if len(accepted) >= max_topics:
            break

    return list(accepted) or ["[No valid topics]"]

145 

146 return valid_topics