mirror of
https://github.com/SamyRai/tercul-backend.git
synced 2025-12-27 05:11:34 +00:00
This commit includes the following changes: - Refactored all data repositories in `internal/data/sql/` to use a consistent `sql` package and to align with the new `domain` models. - Fixed the GraphQL structure by moving the server creation logic from `internal/app` to `cmd/api`, which resolved an import cycle. - Corrected numerous incorrect import paths for packages like `graph`, `linguistics`, `syncjob`, and the legacy `models` package. - Resolved several package and function redeclaration errors. - Removed legacy migration code.
75 lines
1.7 KiB
Go
75 lines
1.7 KiB
Go
package linguistics
|
|
|
|
import (
|
|
"sort"
|
|
"strings"
|
|
"unicode"
|
|
)
|
|
|
|
// KeywordExtractor extracts keywords from text using a simple
// frequency-based heuristic. It is stateless; the zero value is usable,
// but NewKeywordExtractor is the conventional constructor.
type KeywordExtractor struct{}
|
|
|
|
// NewKeywordExtractor creates a new KeywordExtractor
|
|
func NewKeywordExtractor() *KeywordExtractor {
|
|
return &KeywordExtractor{}
|
|
}
|
|
|
|
// Extract extracts keywords from text and returns them
|
|
func (e *KeywordExtractor) Extract(text Text) ([]Keyword, error) {
|
|
// This is a simplified implementation
|
|
// In a real-world scenario, you would use a library like github.com/jdkato/prose
|
|
// or call an external API for keyword extraction
|
|
|
|
content := strings.ToLower(text.Body)
|
|
|
|
// Split into words
|
|
words := strings.FieldsFunc(content, func(r rune) bool {
|
|
return !unicode.IsLetter(r) && !unicode.IsNumber(r)
|
|
})
|
|
|
|
// Count word frequencies
|
|
wordFreq := make(map[string]int)
|
|
for _, word := range words {
|
|
if len(word) > 2 { // Skip very short words
|
|
wordFreq[word]++
|
|
}
|
|
}
|
|
|
|
// Filter out stop words
|
|
for word := range wordFreq {
|
|
if isStopWord(word, text.Language) {
|
|
delete(wordFreq, word)
|
|
}
|
|
}
|
|
|
|
// Convert to keywords
|
|
keywords := make([]Keyword, 0, len(wordFreq))
|
|
totalWords := len(words)
|
|
for word, count := range wordFreq {
|
|
// Calculate relevance based on frequency
|
|
relevance := float64(count) / float64(totalWords)
|
|
|
|
// Boost longer words slightly
|
|
relevance *= (1.0 + float64(len(word))/20.0)
|
|
|
|
keywords = append(keywords, Keyword{
|
|
Text: word,
|
|
Relevance: relevance,
|
|
})
|
|
}
|
|
|
|
// Sort by relevance
|
|
sort.Slice(keywords, func(i, j int) bool {
|
|
return keywords[i].Relevance > keywords[j].Relevance
|
|
})
|
|
|
|
// Limit to top keywords
|
|
maxKeywords := 20
|
|
if len(keywords) > maxKeywords {
|
|
keywords = keywords[:maxKeywords]
|
|
}
|
|
|
|
return keywords, nil
|
|
}
|
|
|