Coverage for src / local_deep_research / news / utils / topic_generator.py: 97%
61 statements
« prev ^ index » next coverage.py v7.12.0, created at 2026-01-11 00:51 +0000
1"""
2Topic generation utilities for news items.
3Uses LLM to extract relevant topics/tags from news content.
4"""
6from loguru import logger
7from typing import List
8import json
def generate_topics(
    query: str, findings: str = "", category: str = "", max_topics: int = 5
) -> List[str]:
    """
    Generate relevant topics/tags from news content.

    Args:
        query: The search query or research question
        findings: The research findings/content
        category: The news category (if available)
        max_topics: Maximum number of topics to generate

    Returns:
        List of topic strings
    """
    # Ask the LLM for topics; an empty result means generation failed.
    generated = _generate_with_llm(query, findings, category, max_topics)

    if not generated:
        # Deliberately no heuristic fallback — surface the failure instead.
        generated = ["[Topic generation failed]"]

    # Final cleanup/dedup pass before handing the topics back.
    return _validate_topics(generated, max_topics)
def _generate_with_llm(
    query: str, findings: str, category: str, max_topics: int
) -> List[str]:
    """Generate topics using LLM.

    Args:
        query: The search query or research question.
        findings: The research findings/content (may be empty).
        category: The news category (may be empty).
        max_topics: Maximum number of topics to return.

    Returns:
        Cleaned list of topic strings (at most ``max_topics``), or an
        empty list when the LLM call or response parsing fails.
    """
    try:
        from ...config.llm_config import get_llm

        logger.debug(
            f"Topic generation - findings length: {len(findings) if findings else 0}, category: {category}"
        )

        # Use the configured model for topic generation
        llm = get_llm(temperature=0.5)

        # Truncate inputs to keep the prompt compact.
        query_preview = query[:500] if len(query) > 500 else query
        findings_preview = (
            findings[:1000] if findings and len(findings) > 1000 else findings
        )

        prompt = f"""Extract relevant topics/tags from this news content.

Query: {query_preview}
{f"Content: {findings_preview}" if findings_preview else ""}
{f"Category: {category}" if category else ""}

Generate {max_topics} specific, relevant topics that would help categorize and filter this news item.

Requirements:
- Each topic should be 1-3 words
- Topics should be specific and meaningful
- Include geographic regions if mentioned
- Include key entities (countries, organizations, people)
- Include event types (conflict, economy, disaster, etc.)
- Topics should be diverse and cover different aspects

Return ONLY a JSON array of topic strings, like: ["Topic 1", "Topic 2", "Topic 3"]"""

        response = llm.invoke(prompt)
        content = response.content.strip()

        # Try to parse the JSON response
        try:
            # Strip Markdown code fences. Previously only a "```json"
            # opener was handled, so a bare "```" fence broke parsing.
            if content.startswith("```json"):
                content = content[7:]
            elif content.startswith("```"):
                content = content[3:]
            if content.endswith("```"):
                content = content[:-3]
            content = content.strip()

            topics = json.loads(content)

            if isinstance(topics, list):
                # Keep only non-empty string entries within length limit.
                cleaned_topics = []
                for topic in topics:
                    if isinstance(topic, str):
                        cleaned = topic.strip()
                        if cleaned and len(cleaned) <= 30:  # Max topic length
                            cleaned_topics.append(cleaned)

                logger.debug(f"Generated topics: {cleaned_topics}")
                return cleaned_topics[:max_topics]

        except json.JSONDecodeError:
            logger.debug(f"Failed to parse LLM topics as JSON: {content}")
            # Fall back to treating the reply as a comma-separated list.
            if "," in content:
                topics = [t.strip().strip("\"'") for t in content.split(",")]
                return [t for t in topics if t and len(t) <= 30][:max_topics]

    except Exception as e:
        # Best-effort: log at debug level and report failure via [].
        logger.debug(f"LLM topic generation failed: {e}")

    return []
114def _validate_topics(topics: List[str], max_topics: int) -> List[str]:
115 """Validate and clean topics."""
116 valid_topics = []
117 seen = set()
119 for topic in topics:
120 if not topic:
121 continue
123 # Clean the topic
124 cleaned = topic.strip()
126 # Skip if too short or too long
127 if len(cleaned) < 2 or len(cleaned) > 30:
128 continue
130 # Skip duplicates (case-insensitive)
131 normalized = cleaned.lower()
132 if normalized in seen:
133 continue
134 seen.add(normalized)
136 # Convert to lowercase as djpetti suggested
137 valid_topics.append(normalized)
139 if len(valid_topics) >= max_topics:
140 break
142 # Don't add default topics - show what actually happened
143 if not valid_topics:
144 valid_topics = ["[No valid topics]"]
146 return valid_topics