mirror of
https://github.com/SamyRai/tercul-backend.git
synced 2025-12-27 05:11:34 +00:00
- Core Go application with GraphQL API using gqlgen - Comprehensive data models for literary works, authors, translations - Repository pattern with caching layer - Authentication and authorization system - Linguistics analysis capabilities with multiple adapters - Vector search integration with Weaviate - Docker containerization support - Python data migration and analysis scripts - Clean architecture with proper separation of concerns - Production-ready configuration and middleware - Proper .gitignore excluding vendor/, database files, and build artifacts
46 lines
1.3 KiB
Go
46 lines
1.3 KiB
Go
package linguistics
|
|
|
|
import (
|
|
"sort"
|
|
"strings"
|
|
)
|
|
|
|
// TFIDFKeywordProvider extracts keywords from a single text by ranking its
// words on how often they occur in that text.
//
// Despite the name this is not a full TF-IDF: there is no corpus, so no
// inverse-document-frequency term is available. Per-text term frequency,
// combined with stop-word filtering and a minimum word length, serves as a
// stand-in for keyword relevance. The type has no fields and keeps no state.
type TFIDFKeywordProvider struct{}
|
|
|
|
// NewTFIDFKeywordProvider returns a ready-to-use keyword provider. The
// provider has no configuration, so the result is simply a pointer to a
// zero-value TFIDFKeywordProvider.
func NewTFIDFKeywordProvider() *TFIDFKeywordProvider {
	return new(TFIDFKeywordProvider)
}
|
|
|
|
func (p *TFIDFKeywordProvider) Extract(text string, language string) ([]Keyword, error) {
|
|
tokens := tokenizeWords(text)
|
|
if len(tokens) == 0 {
|
|
return nil, nil
|
|
}
|
|
freq := make(map[string]int, len(tokens))
|
|
for _, t := range tokens {
|
|
if len(t) <= 2 || isStopWord(t, language) {
|
|
continue
|
|
}
|
|
freq[strings.ToLower(t)]++
|
|
}
|
|
total := 0
|
|
for _, c := range freq {
|
|
total += c
|
|
}
|
|
keywords := make([]Keyword, 0, len(freq))
|
|
for w, c := range freq {
|
|
rel := float64(c) / float64(len(tokens))
|
|
if rel > 0 {
|
|
keywords = append(keywords, Keyword{Text: w, Relevance: rel})
|
|
}
|
|
}
|
|
sort.Slice(keywords, func(i, j int) bool { return keywords[i].Relevance > keywords[j].Relevance })
|
|
if len(keywords) > 10 {
|
|
keywords = keywords[:10]
|
|
}
|
|
return keywords, nil
|
|
}
|
|
|
|
|