mirror of
https://github.com/SamyRai/tercul-backend.git
synced 2025-12-27 00:31:35 +00:00
108 lines
3.6 KiB
Go
108 lines
3.6 KiB
Go
package linguistics
|
|
|
|
import (
|
|
"sort"
|
|
"strings"
|
|
"unicode"
|
|
)
|
|
|
|
// KeywordExtractor is a stateless keyword extractor. It has no fields, so
// every value of the type is interchangeable.
type KeywordExtractor struct{}

// NewKeywordExtractor returns a pointer to an empty KeywordExtractor.
func NewKeywordExtractor() *KeywordExtractor {
	return new(KeywordExtractor)
}
|
|
|
|
// Extract extracts keywords from text and returns them
|
|
func (e *KeywordExtractor) Extract(text Text) ([]Keyword, error) {
|
|
// This is a simplified implementation
|
|
// In a real-world scenario, you would use a library like github.com/jdkato/prose
|
|
// or call an external API for keyword extraction
|
|
|
|
content := strings.ToLower(text.Body)
|
|
|
|
// Split into words
|
|
words := strings.FieldsFunc(content, func(r rune) bool {
|
|
return !unicode.IsLetter(r) && !unicode.IsNumber(r)
|
|
})
|
|
|
|
// Count word frequencies
|
|
wordFreq := make(map[string]int)
|
|
for _, word := range words {
|
|
if len(word) > 2 { // Skip very short words
|
|
wordFreq[word]++
|
|
}
|
|
}
|
|
|
|
// Filter out stop words
|
|
for word := range wordFreq {
|
|
if isStopWord(word) {
|
|
delete(wordFreq, word)
|
|
}
|
|
}
|
|
|
|
// Convert to keywords
|
|
keywords := make([]Keyword, 0, len(wordFreq))
|
|
totalWords := len(words)
|
|
for word, count := range wordFreq {
|
|
// Calculate relevance based on frequency
|
|
relevance := float64(count) / float64(totalWords)
|
|
|
|
// Boost longer words slightly
|
|
relevance *= (1.0 + float64(len(word))/20.0)
|
|
|
|
keywords = append(keywords, Keyword{
|
|
Text: word,
|
|
Relevance: relevance,
|
|
})
|
|
}
|
|
|
|
// Sort by relevance
|
|
sort.Slice(keywords, func(i, j int) bool {
|
|
return keywords[i].Relevance > keywords[j].Relevance
|
|
})
|
|
|
|
// Limit to top keywords
|
|
maxKeywords := 20
|
|
if len(keywords) > maxKeywords {
|
|
keywords = keywords[:maxKeywords]
|
|
}
|
|
|
|
return keywords, nil
|
|
}
|
|
|
|
// isStopWord reports whether word is one of the common English stop words
// listed below. The match is exact and case-sensitive, and every entry is
// lower-case, so word must already be lower-cased by the caller.
//
// The list is written as a switch over constants rather than a map literal
// so that no map has to be allocated and filled on each call.
func isStopWord(word string) bool {
	switch word {
	case "a", "about", "above", "after", "again",
		"against", "all", "am", "an", "and",
		"any", "are", "as", "at", "be",
		"because", "been", "before", "being", "below",
		"between", "both", "but", "by", "can",
		"did", "do", "does", "doing", "don",
		"down", "during", "each", "few", "for",
		"from", "further", "had", "has", "have",
		"having", "he", "her", "here", "hers",
		"herself", "him", "himself", "his", "how",
		"i", "if", "in", "into", "is",
		"it", "its", "itself", "just", "me",
		"more", "most", "my", "myself", "no",
		"nor", "not", "now", "of", "off",
		"on", "once", "only", "or", "other",
		"our", "ours", "ourselves", "out", "over",
		"own", "same", "she", "should", "so",
		"some", "such", "than", "that", "the",
		"their", "theirs", "them", "themselves", "then",
		"there", "these", "they", "this", "those",
		"through", "to", "too", "under", "until",
		"up", "very", "was", "we", "were",
		"what", "when", "where", "which", "while",
		"who", "whom", "why", "will", "with",
		"would", "you", "your", "yours", "yourself",
		"yourselves":
		return true
	}
	return false
}
|