package enrich

import (
	"sort"
	"strings"
	"unicode"
	"unicode/utf8"
)

// keywordStopWords is the set of common English words that are never reported
// as keywords. It is built once at package initialisation and is only read
// afterwards.
var keywordStopWords = map[string]struct{}{
	"a": {}, "about": {}, "above": {}, "after": {}, "again": {}, "against": {},
	"all": {}, "am": {}, "an": {}, "and": {}, "any": {}, "are": {}, "as": {},
	"at": {}, "be": {}, "because": {}, "been": {}, "before": {}, "being": {},
	"below": {}, "between": {}, "both": {}, "but": {}, "by": {}, "can": {},
	"did": {}, "do": {}, "does": {}, "doing": {}, "don": {}, "down": {},
	"during": {}, "each": {}, "few": {}, "for": {}, "from": {}, "further": {},
	"had": {}, "has": {}, "have": {}, "having": {}, "he": {}, "her": {},
	"here": {}, "hers": {}, "herself": {}, "him": {}, "himself": {}, "his": {},
	"how": {}, "i": {}, "if": {}, "in": {}, "into": {}, "is": {}, "it": {},
	"its": {}, "itself": {}, "just": {}, "me": {}, "more": {}, "most": {},
	"my": {}, "myself": {}, "no": {}, "nor": {}, "not": {}, "now": {}, "of": {},
	"off": {}, "on": {}, "once": {}, "only": {}, "or": {}, "other": {},
	"our": {}, "ours": {}, "ourselves": {}, "out": {}, "over": {}, "own": {},
	"same": {}, "she": {}, "should": {}, "so": {}, "some": {}, "such": {},
	"than": {}, "that": {}, "the": {}, "their": {}, "theirs": {}, "them": {},
	"themselves": {}, "then": {}, "there": {}, "these": {}, "they": {},
	"this": {}, "those": {}, "through": {}, "to": {}, "too": {}, "under": {},
	"until": {}, "up": {}, "very": {}, "was": {}, "we": {}, "were": {},
	"what": {}, "when": {}, "where": {}, "which": {}, "while": {}, "who": {},
	"whom": {}, "why": {}, "will": {}, "with": {}, "would": {}, "you": {},
	"your": {}, "yours": {}, "yourself": {}, "yourselves": {},
}

// KeywordExtractor extracts keywords from text using a simple
// word-frequency heuristic. It holds no state.
type KeywordExtractor struct{}

// NewKeywordExtractor creates a new KeywordExtractor.
func NewKeywordExtractor() *KeywordExtractor {
	return &KeywordExtractor{}
}

// Extract returns up to 20 keywords found in text.Body, most relevant first.
//
// The body is lower-cased and split on every rune that is neither a letter
// nor a number. Words shorter than three characters and stop words are
// discarded. A word's relevance is its share of all tokens in the body
// (discarded ones included), multiplied by 1 + length/20 so that longer
// words rank slightly higher. Keywords with equal relevance are ordered
// alphabetically, so the same input always yields the same output.
//
// The result is an empty, non-nil slice when no keyword qualifies. The
// returned error is always nil.
//
// This is a simplified implementation; a production system would typically
// use an NLP library such as github.com/jdkato/prose or an external keyword
// extraction API.
func (e *KeywordExtractor) Extract(text Text) ([]Keyword, error) {
	const (
		minWordRunes = 3  // words with fewer characters are ignored
		maxKeywords  = 20 // upper bound on the number of results
	)

	content := strings.ToLower(text.Body)
	words := strings.FieldsFunc(content, func(r rune) bool {
		return !unicode.IsLetter(r) && !unicode.IsNumber(r)
	})

	// Count occurrences of every candidate word. Length is measured in runes,
	// not bytes, so non-ASCII words are judged by their character count.
	wordFreq := make(map[string]int)
	for _, word := range words {
		if utf8.RuneCountInString(word) < minWordRunes || isStopWord(word) {
			continue
		}
		wordFreq[word]++
	}

	// The denominator is the full token count, including skipped tokens.
	// It cannot be zero inside the loop: wordFreq is empty when words is.
	totalWords := float64(len(words))
	keywords := make([]Keyword, 0, len(wordFreq))
	for word, count := range wordFreq {
		relevance := float64(count) / totalWords
		// Boost longer words slightly.
		relevance *= 1.0 + float64(utf8.RuneCountInString(word))/20.0
		keywords = append(keywords, Keyword{
			Text:      word,
			Relevance: relevance,
		})
	}

	// Map iteration order is random and sort.Slice is not stable, so ties are
	// broken on the keyword text to keep ordering and truncation deterministic.
	sort.Slice(keywords, func(i, j int) bool {
		if keywords[i].Relevance != keywords[j].Relevance {
			return keywords[i].Relevance > keywords[j].Relevance
		}
		return keywords[i].Text < keywords[j].Text
	})

	if len(keywords) > maxKeywords {
		keywords = keywords[:maxKeywords]
	}
	return keywords, nil
}

// isStopWord reports whether word is a common English stop word. The lookup
// is case-sensitive; callers are expected to pass lower-cased words.
func isStopWord(word string) bool {
	_, ok := keywordStopWords[word]
	return ok
}