Coverage for src / local_deep_research / news / utils / topic_generator.py: 100%

57 statements  

« prev     ^ index     » next       coverage.py v7.13.5, created at 2026-04-14 23:55 +0000

1""" 

2Topic generation utilities for news items. 

3Uses LLM to extract relevant topics/tags from news content. 

4""" 

5 

6from loguru import logger 

7from typing import List 

8 

9from ...utilities.json_utils import extract_json, get_llm_response_text 

10 

11 

def generate_topics(
    query: str, findings: str = "", category: str = "", max_topics: int = 5
) -> List[str]:
    """
    Generate relevant topics/tags from news content.

    Args:
        query: The search query or research question
        findings: The research findings/content
        category: The news category (if available)
        max_topics: Maximum number of topics to generate

    Returns:
        List of topic strings
    """
    # LLM extraction is the only strategy: there is deliberately no
    # heuristic fallback, so a failure surfaces as an explicit
    # placeholder rather than fabricated topics.
    generated = _generate_with_llm(query, findings, category, max_topics)

    if not generated:
        generated = ["[Topic generation failed]"]

    # Final cleanup pass (dedupe, length limits, lowercase).
    return _validate_topics(generated, max_topics)

36 

37 

def _generate_with_llm(
    query: str, findings: str, category: str, max_topics: int
) -> List[str]:
    """Generate topics using the configured LLM.

    Args:
        query: The search query or research question (truncated to 500 chars).
        findings: Research findings/content, may be empty (truncated to 1000 chars).
        category: News category hint, may be empty.
        max_topics: Maximum number of topics to return.

    Returns:
        A list of cleaned topic strings (each non-empty, at most 30 chars),
        or an empty list on any failure — the caller treats an empty result
        as "generation failed".
    """
    try:
        from ...config.llm_config import get_llm

        logger.debug(
            f"Topic generation - findings length: {len(findings) if findings else 0}, category: {category}"
        )

        # Use the configured model for topic generation
        llm = get_llm(temperature=0.5)

        try:
            # Truncate long inputs so the prompt stays compact.
            # (slicing is a no-op when the text is already short)
            query_preview = query[:500]
            findings_preview = findings[:1000] if findings else findings

            prompt = f"""Extract relevant topics/tags from this news content.

Query: {query_preview}
{f"Content: {findings_preview}" if findings_preview else ""}
{f"Category: {category}" if category else ""}

Generate {max_topics} specific, relevant topics that would help categorize and filter this news item.

Requirements:
- Each topic should be 1-3 words
- Topics should be specific and meaningful
- Include geographic regions if mentioned
- Include key entities (countries, organizations, people)
- Include event types (conflict, economy, disaster, etc.)
- Topics should be diverse and cover different aspects

Return ONLY a JSON array of topic strings, like: ["Topic 1", "Topic 2", "Topic 3"]"""

            response = llm.invoke(prompt)
            content = get_llm_response_text(response)

            # Preferred path: the model returned a parseable JSON array.
            topics = extract_json(content, expected_type=list)

            if topics is not None:
                # Clean and validate each topic
                cleaned_topics = []
                for topic in topics:
                    if isinstance(topic, str):
                        cleaned = topic.strip()
                        if cleaned and len(cleaned) <= 30:  # Max topic length
                            cleaned_topics.append(cleaned)

                logger.debug(f"Generated topics: {cleaned_topics}")
                return cleaned_topics[:max_topics]

            # Fallback: treat the response as comma-separated plain text.
            # Strip surrounding JSON brackets first so malformed output
            # like `[Topic A, Topic B]` (unquoted, hence unparseable as
            # JSON) does not leave "[" / "]" glued to the edge topics.
            logger.debug(f"Failed to parse LLM topics as JSON: {content}")
            if "," in content:
                stripped = content.strip().strip("[]")
                topics = [t.strip().strip("\"'") for t in stripped.split(",")]
                return [t for t in topics if t and len(t) <= 30][:max_topics]
        finally:
            # Always release resources held by the LLM client, even when
            # invocation or parsing raised.
            from ...utilities.resource_utils import safe_close

            safe_close(llm, "topic LLM")

    except Exception as e:
        # Topic generation is non-critical: log and fall through to an
        # empty result instead of propagating.
        logger.debug(f"LLM topic generation failed: {e}")

    return []

111 

112 

113def _validate_topics(topics: List[str], max_topics: int) -> List[str]: 

114 """Validate and clean topics.""" 

115 valid_topics = [] 

116 seen = set() 

117 

118 for topic in topics: 

119 if not topic: 

120 continue 

121 

122 # Clean the topic 

123 cleaned = topic.strip() 

124 

125 # Skip if too short or too long 

126 if len(cleaned) < 2 or len(cleaned) > 30: 

127 continue 

128 

129 # Skip duplicates (case-insensitive) 

130 normalized = cleaned.lower() 

131 if normalized in seen: 

132 continue 

133 seen.add(normalized) 

134 

135 # Convert to lowercase as djpetti suggested 

136 valid_topics.append(normalized) 

137 

138 if len(valid_topics) >= max_topics: 

139 break 

140 

141 # Don't add default topics - show what actually happened 

142 if not valid_topics: 

143 valid_topics = ["[No valid topics]"] 

144 

145 return valid_topics