tercul-backend/internal/jobs/linguistics/keyword_extractor.go
google-labs-jules[bot] 8797cec718 Refactor: In-progress refactoring to fix build.
This commit includes the following changes:
- Refactored all data repositories in `internal/data/sql/` to use a consistent `sql` package and to align with the new `domain` models.
- Fixed the GraphQL structure by moving the server creation logic from `internal/app` to `cmd/api`, which resolved an import cycle.
- Corrected numerous incorrect import paths for packages like `graph`, `linguistics`, `syncjob`, and the legacy `models` package.
- Resolved several package and function redeclaration errors.
- Removed legacy migration code.
2025-09-05 15:11:30 +00:00

75 lines
1.7 KiB
Go

package linguistics
import (
	"sort"
	"strings"
	"unicode"
	"unicode/utf8"
)
// KeywordExtractor extracts keywords from text. It has no fields, so it
// carries no configuration or state between calls.
type KeywordExtractor struct{}
// NewKeywordExtractor returns a pointer to a fresh KeywordExtractor. The type
// has no fields, so there is nothing to configure.
func NewKeywordExtractor() *KeywordExtractor {
	return new(KeywordExtractor)
}
// Extract returns the highest-ranked keywords found in text.Body.
//
// The body is lower-cased and split on every rune that is neither a letter
// nor a number. Words shorter than three characters, and words for which
// isStopWord reports true for text.Language, are discarded. The relevance of
// each remaining word is its share of all tokens in the body (discarded
// tokens still count towards the total), boosted by 5% per character so that
// longer words rank slightly higher.
//
// The result is sorted by descending relevance, with ties broken
// alphabetically so that identical input always yields identical output, and
// is capped at 20 entries. The returned slice is never nil and the returned
// error is always nil.
func (e *KeywordExtractor) Extract(text Text) ([]Keyword, error) {
	// This is a simplified, frequency-based implementation. In a real-world
	// scenario you would use a library like github.com/jdkato/prose or call
	// an external API for keyword extraction.
	const (
		minWordLen  = 3  // words with fewer characters are skipped
		maxKeywords = 20 // upper bound on the number of returned keywords
	)

	content := strings.ToLower(text.Body)

	// Split into words on anything that is not a letter or a number.
	words := strings.FieldsFunc(content, func(r rune) bool {
		return !unicode.IsLetter(r) && !unicode.IsNumber(r)
	})

	// Count word frequencies.
	wordFreq := make(map[string]int)
	for _, word := range words {
		// Measure length in runes, not bytes: with len() a two-letter word in
		// a multi-byte script (e.g. Cyrillic) would slip past this filter.
		if utf8.RuneCountInString(word) < minWordLen {
			continue
		}
		wordFreq[word]++
	}

	// Convert to keywords, dropping stop words along the way.
	totalWords := len(words)
	keywords := make([]Keyword, 0, len(wordFreq))
	for word, count := range wordFreq {
		if isStopWord(word, text.Language) {
			continue
		}

		// Relevance is the word's frequency relative to all tokens.
		relevance := float64(count) / float64(totalWords)
		// Boost longer words slightly; the length is counted in characters so
		// that multi-byte scripts are not favoured over ASCII text.
		relevance *= 1.0 + float64(utf8.RuneCountInString(word))/20.0

		keywords = append(keywords, Keyword{
			Text:      word,
			Relevance: relevance,
		})
	}

	// Sort by relevance. Map iteration order is random and sort.Slice is not
	// stable, so equal scores need an explicit tie-break; otherwise both the
	// order and the set of words surviving the cut below vary between runs.
	sort.Slice(keywords, func(i, j int) bool {
		if keywords[i].Relevance != keywords[j].Relevance {
			return keywords[i].Relevance > keywords[j].Relevance
		}
		return keywords[i].Text < keywords[j].Text
	})

	// Limit to the top keywords.
	if len(keywords) > maxKeywords {
		keywords = keywords[:maxKeywords]
	}

	return keywords, nil
}