tercul-backend/internal/jobs/linguistics/keyword_extractor.go
Damir Mukimov d50722dad5
Refactor ID handling to use UUIDs across the application
- Updated database models and repositories to replace uint IDs with UUIDs (see the sketch after this list).
- Modified test fixtures to generate and use UUIDs for authors, translations, users, and works.
- Adjusted mock implementations to align with the new UUID structure.
- Ensured all relevant functions and methods are updated to handle UUIDs correctly.
- Added necessary imports for UUID handling in various files.
2025-12-27 00:33:34 +01:00
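
For illustration, a minimal sketch of the ID change described above. The Author model, the GORM tags, and the NewAuthor helper here are hypothetical, assuming GORM and github.com/google/uuid; the repo's actual models are not shown on this page.

package models

import "github.com/google/uuid"

// Author carries a UUID primary key instead of the previous
// auto-incrementing integer (formerly something like: ID uint `gorm:"primaryKey"`).
type Author struct {
	ID   uuid.UUID `gorm:"type:uuid;primaryKey"` // hypothetical tag; assumes GORM
	Name string
}

// NewAuthor assigns a fresh UUID at construction time instead of
// relying on the database to generate an integer key.
func NewAuthor(name string) Author {
	return Author{ID: uuid.New(), Name: name}
}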

package linguistics

import (
	"sort"
	"strings"
	"unicode"
)

// KeywordExtractor extracts keywords from text.
type KeywordExtractor struct{}

// NewKeywordExtractor creates a new KeywordExtractor.
func NewKeywordExtractor() *KeywordExtractor {
	return &KeywordExtractor{}
}

// Extract extracts keywords from text and returns them.
func (e *KeywordExtractor) Extract(text Text) ([]Keyword, error) {
	// This is a simplified implementation. In a real-world scenario, you
	// would use a library like github.com/jdkato/prose or call an external
	// API for keyword extraction.
	content := strings.ToLower(text.Body)

	// Split into words on any rune that is neither a letter nor a digit.
	words := strings.FieldsFunc(content, func(r rune) bool {
		return !unicode.IsLetter(r) && !unicode.IsNumber(r)
	})
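	// At this point a body like "Hello, world! 42" has been split into
	// ["hello", "world", "42"].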

	// Count word frequencies.
	wordFreq := make(map[string]int)
	for _, word := range words {
		if len(word) > 2 { // Skip very short words.
			wordFreq[word]++
		}
	}

	// Filter out stop words.
	for word := range wordFreq {
		if isStopWord(word, text.Language) {
			delete(wordFreq, word)
		}
	}
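	// Note: isStopWord is not defined in this file; it is assumed to be
	// provided elsewhere in the linguistics package, keyed by text.Language.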

	// Convert to keywords.
	keywords := make([]Keyword, 0, len(wordFreq))
	totalWords := len(words)
	for word, count := range wordFreq {
		// Calculate relevance based on frequency.
		relevance := float64(count) / float64(totalWords)
		// Boost longer words slightly.
		relevance *= (1.0 + float64(len(word))/20.0)
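		// Worked example: a 7-letter word seen 3 times among 100 tokens
		// scores (3/100) * (1 + 7/20) = 0.03 * 1.35 = 0.0405.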
		keywords = append(keywords, Keyword{
			Text:      word,
			Relevance: relevance,
		})
	}

	// Sort by relevance, highest first.
	sort.Slice(keywords, func(i, j int) bool {
		return keywords[i].Relevance > keywords[j].Relevance
	})

	// Limit to top keywords.
	maxKeywords := 20
	if len(keywords) > maxKeywords {
		keywords = keywords[:maxKeywords]
	}

	return keywords, nil
}
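
For context, a minimal usage sketch. The import path follows the file header above (the module path is an assumption), and Text is assumed to be a struct whose relevant exported fields are Body and Language, as used inside Extract.

package main

import (
	"fmt"
	"log"

	"tercul-backend/internal/jobs/linguistics"
)

func main() {
	extractor := linguistics.NewKeywordExtractor()
	keywords, err := extractor.Extract(linguistics.Text{
		Body:     "Go makes concurrent programming simple, and concurrent programs scale well.",
		Language: "en",
	})
	if err != nil {
		log.Fatal(err)
	}
	// Print each keyword with its relevance score, most relevant first.
	for _, kw := range keywords {
		fmt.Printf("%-12s %.4f\n", kw.Text, kw.Relevance)
	}
}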