tercul-backend/internal/jobs/linguistics/adapter_tfidf.go

44 lines
1.2 KiB
Go

package linguistics
import (
	"sort"
	"strings"
	"unicode/utf8"
)
// TFIDFKeywordProvider is a lightweight keyword provider that ranks the terms
// of a single text by how often they occur in that text.
//
// Genuine TF-IDF needs document frequencies drawn from a corpus. This provider
// holds no state, so it approximates keyword relevance with per-text frequency
// weighting combined with stopword filtering and a minimum word length.
type TFIDFKeywordProvider struct{}

// NewTFIDFKeywordProvider returns a new TFIDFKeywordProvider.
func NewTFIDFKeywordProvider() *TFIDFKeywordProvider {
	return new(TFIDFKeywordProvider)
}
// Extract returns up to ten keywords for text, ordered by descending relevance.
//
// Tokens are obtained from tokenizeWords. Each token is lowercased and then
// discarded if it is two characters or shorter, or if isStopWord reports it as
// a stopword for language. A keyword's relevance is its occurrence count
// divided by the total number of tokens in text (discarded tokens included in
// the denominator). Keywords with equal relevance are ordered alphabetically,
// so both the ordering and the set that survives the top-ten cut are
// deterministic.
//
// It returns (nil, nil) when text yields no tokens and an empty, non-nil slice
// when every token is filtered out. The returned error is always nil.
func (p *TFIDFKeywordProvider) Extract(text string, language string) ([]Keyword, error) {
	const (
		minWordRunes = 3  // words shorter than this are ignored
		maxKeywords  = 10 // upper bound on the number of keywords returned
	)

	tokens := tokenizeWords(text)
	if len(tokens) == 0 {
		return nil, nil
	}

	freq := make(map[string]int, len(tokens))
	for _, t := range tokens {
		// Normalize first so the filters and the frequency key see the same form.
		w := strings.ToLower(t)
		// Count runes, not bytes: a two-letter word in a non-Latin script
		// occupies more than two bytes and would otherwise slip through.
		if utf8.RuneCountInString(w) < minWordRunes || isStopWord(w, language) {
			continue
		}
		freq[w]++
	}

	tokenCount := float64(len(tokens))
	keywords := make([]Keyword, 0, len(freq))
	for w, c := range freq {
		keywords = append(keywords, Keyword{Text: w, Relevance: float64(c) / tokenCount})
	}

	// Map iteration order is random and sort.Slice is not stable, so break
	// relevance ties on the keyword text to get a total, repeatable order.
	sort.Slice(keywords, func(i, j int) bool {
		if keywords[i].Relevance != keywords[j].Relevance {
			return keywords[i].Relevance > keywords[j].Relevance
		}
		return keywords[i].Text < keywords[j].Text
	})

	if len(keywords) > maxKeywords {
		keywords = keywords[:maxKeywords]
	}
	return keywords, nil
}