tercul-backend/internal/jobs/linguistics/keyword_extractor.go
Damir Mukimov d50722dad5
Refactor ID handling to use UUIDs across the application
- Updated database models and repositories to replace uint IDs with UUIDs (see the sketch after this list).
- Modified test fixtures to generate and use UUIDs for authors, translations, users, and works.
- Adjusted mock implementations to align with the new UUID structure.
- Ensured all relevant functions and methods are updated to handle UUIDs correctly.
- Added necessary imports for UUID handling in various files.
2025-12-27 00:33:34 +01:00
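
For illustration, a minimal sketch of the ID change described above. The Author model, the GORM tags, and the NewAuthor helper here are hypothetical, assuming GORM and github.com/google/uuid; the repo's actual models are not shown on this page.

package models

import "github.com/google/uuid"

// Author carries a UUID primary key instead of the previous
// auto-incrementing integer (formerly something like: ID uint `gorm:"primaryKey"`).
type Author struct {
	ID   uuid.UUID `gorm:"type:uuid;primaryKey"` // hypothetical tag; assumes GORM
	Name string
}

// NewAuthor assigns a fresh UUID at construction time instead of
// relying on the database to generate an integer key.
func NewAuthor(name string) Author {
	return Author{ID: uuid.New(), Name: name}
}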

package linguistics

import (
	"sort"
	"strings"
	"unicode"
)

// KeywordExtractor extracts keywords from text.
type KeywordExtractor struct{}

// NewKeywordExtractor creates a new KeywordExtractor.
func NewKeywordExtractor() *KeywordExtractor {
	return &KeywordExtractor{}
}

// Extract extracts keywords from text and returns them.
func (e *KeywordExtractor) Extract(text Text) ([]Keyword, error) {
	// This is a simplified implementation. In a real-world scenario, you
	// would use a library like github.com/jdkato/prose or call an external
	// API for keyword extraction.
	content := strings.ToLower(text.Body)

	// Split into words on any rune that is neither a letter nor a digit.
	words := strings.FieldsFunc(content, func(r rune) bool {
		return !unicode.IsLetter(r) && !unicode.IsNumber(r)
	})
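	// At this point a body like "Hello, world! 42" has been split into
	// ["hello", "world", "42"].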

	// Count word frequencies.
	wordFreq := make(map[string]int)
	for _, word := range words {
		if len(word) > 2 { // Skip very short words.
			wordFreq[word]++
		}
	}

	// Filter out stop words.
	for word := range wordFreq {
		if isStopWord(word, text.Language) {
			delete(wordFreq, word)
		}
	}
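	// Note: isStopWord is not defined in this file; it is assumed to be
	// provided elsewhere in the linguistics package, keyed by text.Language.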

	// Convert to keywords.
	keywords := make([]Keyword, 0, len(wordFreq))
	totalWords := len(words)
	for word, count := range wordFreq {
		// Calculate relevance based on frequency.
		relevance := float64(count) / float64(totalWords)
		// Boost longer words slightly.
		relevance *= (1.0 + float64(len(word))/20.0)
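		// Worked example: a 7-letter word seen 3 times among 100 tokens
		// scores (3/100) * (1 + 7/20) = 0.03 * 1.35 = 0.0405.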
		keywords = append(keywords, Keyword{
			Text:      word,
			Relevance: relevance,
		})
	}

	// Sort by relevance, highest first.
	sort.Slice(keywords, func(i, j int) bool {
		return keywords[i].Relevance > keywords[j].Relevance
	})

	// Limit to top keywords.
	maxKeywords := 20
	if len(keywords) > maxKeywords {
		keywords = keywords[:maxKeywords]
	}

	return keywords, nil
}
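
For context, a minimal usage sketch. The import path follows the file header above (the module path is an assumption), and Text is assumed to be a struct whose relevant exported fields are Body and Language, as used inside Extract.

package main

import (
	"fmt"
	"log"

	"tercul-backend/internal/jobs/linguistics"
)

func main() {
	extractor := linguistics.NewKeywordExtractor()
	keywords, err := extractor.Extract(linguistics.Text{
		Body:     "Go makes concurrent programming simple, and concurrent programs scale well.",
		Language: "en",
	})
	if err != nil {
		log.Fatal(err)
	}
	// Print each keyword with its relevance score, most relevant first.
	for _, kw := range keywords {
		fmt.Printf("%-12s %.4f\n", kw.Text, kw.Relevance)
	}
}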