tercul-backend/internal/jobs/linguistics/text_utils.go
google-labs-jules[bot] 53aa4d0344
Security Hardening and GraphQL Caching (#69)
* feat: add security middleware, graphql apq, and improved linting

- Add RateLimit, RequestValidation, and CORS middleware.
- Configure middleware chain in API server (see the sketch after this commit message).
- Implement Redis cache for GraphQL Automatic Persisted Queries.
- Add .golangci.yml and fix linting issues (shadowing, timeouts).

* feat: security, caching and linting config

- Fix .golangci.yml config for govet shadow check
- (Previous changes: Security middleware, GraphQL APQ, Linting fixes)

* fix: resolve remaining lint errors

- Fix unhandled errors in tests (errcheck)
- Define constants for repeated strings (goconst)
- Suppress high complexity warnings with nolint:gocyclo
- Fix integer overflow warnings (gosec)
- Add package comments
- Split long lines (lll)
- Rename Analyse -> Analyze (misspell)
- Fix naked returns and unused params

---------

Co-authored-by: google-labs-jules[bot] <161369871+google-labs-jules[bot]@users.noreply.github.com>
2025-12-01 00:14:22 +01:00
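
The middleware pieces listed above live elsewhere in the repository and are not shown on this page. As a rough illustration of the pattern the first bullet group describes, here is a minimal net/http middleware chain with a token-bucket rate limiter; the names (Chain, RateLimit) and the use of golang.org/x/time/rate are assumptions for the sketch, not the project's actual API:

package middleware

import (
	"net/http"

	"golang.org/x/time/rate"
)

// Chain applies middleware so the first entry becomes the outermost wrapper.
func Chain(h http.Handler, mw ...func(http.Handler) http.Handler) http.Handler {
	for i := len(mw) - 1; i >= 0; i-- {
		h = mw[i](h)
	}
	return h
}

// RateLimit rejects requests once a shared token bucket is exhausted.
func RateLimit(rps float64, burst int) func(http.Handler) http.Handler {
	limiter := rate.NewLimiter(rate.Limit(rps), burst)
	return func(next http.Handler) http.Handler {
		return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
			if !limiter.Allow() {
				http.Error(w, "rate limit exceeded", http.StatusTooManyRequests)
				return
			}
			next.ServeHTTP(w, r)
		})
	}
}

With this shape, a call like Chain(apiHandler, RateLimit(10, 20), corsMiddleware, requestValidation) (the last two being hypothetical stand-ins for the middleware named in the commit) wires the chain outermost-first.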

package linguistics

import (
	"sort"
	"strings"
	"unicode"
)

// Precomputed lexical resources for fast lookups.
var (
	stopWordsEN = map[string]struct{}{
		"the": {}, "a": {}, "an": {}, "and": {}, "or": {}, "but": {},
		"in": {}, "on": {}, "at": {}, "to": {}, "for": {}, "of": {},
		"with": {}, "by": {}, "is": {}, "are": {}, "was": {}, "were": {},
		"be": {}, "been": {}, "being": {}, "have": {}, "has": {}, "had": {},
		"do": {}, "does": {}, "did": {}, "will": {}, "would": {}, "could": {},
		"should": {}, "may": {}, "might": {}, "can": {}, "this": {}, "that": {},
		"these": {}, "those": {}, "i": {}, "you": {}, "he": {}, "she": {},
		"it": {}, "we": {}, "they": {}, "me": {}, "him": {}, "hers": {}, "over": {},
		"us": {}, "them": {}, "my": {}, "your": {}, "his": {}, "its": {},
		"our": {}, "their": {},
	}
	positiveEN = map[string]struct{}{
		"good": {}, "great": {}, "excellent": {}, "amazing": {}, "wonderful": {},
		"beautiful": {}, "love": {}, "happy": {}, "joy": {}, "success": {},
		"win": {}, "winning": {}, "best": {}, "perfect": {}, "fantastic": {},
		"brilliant": {}, "outstanding": {}, "superb": {}, "magnificent": {},
		"delightful": {}, "pleasure": {}, "enjoy": {}, "enjoyable": {},
	}
	negativeEN = map[string]struct{}{
		"bad": {}, "terrible": {}, "awful": {}, "horrible": {}, "disgusting": {},
		"hate": {}, "sad": {}, "angry": {}, "furious": {}, "disappointed": {},
		"fail": {}, "failure": {}, "lose": {}, "losing": {}, "worst": {},
		"dreadful": {}, "miserable": {}, "painful": {},
		"annoying": {}, "frustrating": {}, "upset": {}, "depressed": {},
	}
)

// analyzeTextBasicStats performs basic text statistics analysis.
func analyzeTextBasicStats(text string) (words, sentences, paragraphs int, avgWordLength float64) {
	if text == "" {
		return 0, 0, 0, 0
	}
	words, sentences, paragraphs, totalWordLen := scanTextStats(text)
	if words > 0 {
		avgWordLength = float64(totalWordLen) / float64(words)
	}
	return words, sentences, paragraphs, avgWordLength
}

// analyzeChunkBasicStats performs basic statistics on a text chunk. It
// returns the raw word-length sum and word count (rather than an average)
// so callers can aggregate averages correctly across chunks.
func analyzeChunkBasicStats(chunk string) (words, sentences, paragraphs int, wordLengthSum float64, wordCount int) {
	if chunk == "" {
		return 0, 0, 0, 0, 0
	}
	words, sentences, paragraphs, totalWordLen := scanTextStats(chunk)
	return words, sentences, paragraphs, float64(totalWordLen), words
}

// scanTextStats is the shared core that scans text once and returns the
// word, sentence, and paragraph counts along with the total word length.
func scanTextStats[T ~string](text T) (words int, sentences int, paragraphs int, totalWordLen int) {
	if len(text) == 0 {
		return 0, 0, 0, 0
	}
	inWord := false
	wordLen := 0
	words = 0
	sentences = 0
	paragraphs = 1
	prevWasNewline := false
	for _, r := range text {
		// A blank line (two consecutive newlines) starts a new paragraph.
		if r == '\n' {
			if prevWasNewline {
				paragraphs++
				prevWasNewline = false
			} else {
				prevWasNewline = true
			}
		} else {
			prevWasNewline = false
		}
		if r == '.' || r == '!' || r == '?' {
			sentences++
		}
		// Runs of letters and digits form words; any other rune ends one.
		if unicode.IsLetter(r) || unicode.IsNumber(r) {
			inWord = true
			wordLen++
		} else if inWord {
			words++
			totalWordLen += wordLen
			inWord = false
			wordLen = 0
		}
	}
	if inWord {
		words++
		totalWordLen += wordLen
	}
	return words, sentences, paragraphs, totalWordLen
}
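
// Illustrative example (not in the original source): for the input
// "Hello world. Second sentence!\n\nNew paragraph." scanTextStats yields
// words=6, sentences=3, paragraphs=2, totalWordLen=36, so the average
// word length reported by analyzeTextBasicStats would be 36/6 = 6.0.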

// splitTextIntoChunks splits text into chunks for concurrent processing.
func splitTextIntoChunks(text string, numChunks int) []string {
	if numChunks <= 1 || text == "" {
		return []string{text}
	}
	// Split on sentence boundaries to avoid breaking words. Note that the
	// original terminators ('.', '!', '?') are dropped here and every
	// boundary is rewritten as ". " when the chunks are rejoined below.
	sentences := strings.FieldsFunc(text, func(r rune) bool {
		return r == '.' || r == '!' || r == '?'
	})
	if len(sentences) == 0 {
		return []string{text}
	}
	if numChunks > len(sentences) {
		numChunks = len(sentences)
	}
	// Distribute sentences as evenly as possible: the first `remainder`
	// chunks each get one extra sentence.
	chunks := make([]string, numChunks)
	chunkSize := len(sentences) / numChunks
	remainder := len(sentences) % numChunks
	start := 0
	for i := 0; i < numChunks; i++ {
		end := start + chunkSize
		if i < remainder {
			end++
		}
		if end > len(sentences) {
			end = len(sentences)
		}
		chunks[i] = strings.Join(sentences[start:end], ". ")
		start = end
	}
	return chunks
}
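
// Illustrative example (not in the original source): splitting a text of
// 10 sentences with numChunks=3 gives chunkSize=3 and remainder=1, so the
// chunks receive 4, 3, and 3 sentences respectively.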

// calculateReadabilityScore calculates a simplified Flesch reading-ease
// score.
func calculateReadabilityScore(avgSentenceLength, avgWordLength float64) float64 {
	// Original Flesch reading-ease formula:
	//   206.835 - 1.015 × (total words ÷ total sentences) - 84.6 × (total syllables ÷ total words)
	// Simplified here by using average word length as a stand-in for
	// syllables per word:
	//   206.835 - 1.015 × avgSentenceLength - 84.6 × avgWordLength
	score := 206.835 - 1.015*avgSentenceLength - 84.6*avgWordLength
	// Clamp to the conventional 0-100 range.
	if score < 0 {
		score = 0
	} else if score > 100 {
		score = 100
	}
	return score
}
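
// Illustrative example (not in the original source): with an average
// sentence length of 12 words and an average word length of 2.0, the raw
// score is 206.835 - 1.015*12 - 84.6*2.0 = 25.455. Because avgWordLength
// is measured in characters rather than syllables, typical English prose
// (4-5 characters per word) drives the raw score negative, where it is
// clamped to 0.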

// extractKeywordsOptimized extracts keywords from text using a simplified
// frequency-based approach.
func extractKeywordsOptimized(text, language string) []Keyword {
	if text == "" {
		return []Keyword{}
	}
	tokens := tokenizeWords(text)
	if len(tokens) == 0 {
		return []Keyword{}
	}
	// Count tokens that are long enough and not stop words.
	wordFreq := make(map[string]int, len(tokens))
	for _, tok := range tokens {
		if len(tok) > 2 && !isStopWord(tok, language) {
			wordFreq[tok]++
		}
	}
	// Relevance is the token's share of all tokens; keep only words above
	// a 1% threshold, then return the top 10 by relevance.
	total := len(tokens)
	keywords := make([]Keyword, 0, len(wordFreq))
	for word, freq := range wordFreq {
		relevance := float64(freq) / float64(total)
		if relevance > 0.01 {
			keywords = append(keywords, Keyword{Text: word, Relevance: relevance})
		}
	}
	sort.Slice(keywords, func(i, j int) bool { return keywords[i].Relevance > keywords[j].Relevance })
	if len(keywords) > 10 {
		keywords = keywords[:10]
	}
	return keywords
}
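
// Usage sketch (illustrative; assumes the Keyword struct defined elsewhere
// in this package has the Text and Relevance fields used above):
//
//	for _, kw := range extractKeywordsOptimized("the spacecraft docked and the crew slept", "en") {
//		fmt.Printf("%-12s %.3f\n", kw.Text, kw.Relevance)
//	}
//
// Stop words ("the", "and") and tokens of length <= 2 are skipped, so only
// "spacecraft", "docked", "crew", and "slept" are counted, each with
// relevance 1/7.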

// estimateSentimentOptimized estimates sentiment with a simple lexicon
// count, returning a score between -1 (all negative) and 1 (all positive).
func estimateSentimentOptimized(text, language string) float64 {
	if text == "" {
		return 0
	}
	tokens := tokenizeWords(text)
	positiveCount := 0
	negativeCount := 0
	for _, tok := range tokens {
		if isPositiveWord(tok, language) {
			positiveCount++
		} else if isNegativeWord(tok, language) {
			negativeCount++
		}
	}
	total := positiveCount + negativeCount
	if total == 0 {
		return 0
	}
	return float64(positiveCount-negativeCount) / float64(total)
}
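
// Illustrative example (not in the original source): a text containing
// three positive tokens ("good", "happy", "love") and one negative token
// ("sad") scores (3-1)/4 = 0.5; text with no lexicon hits scores 0.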

// isStopWord checks whether a word is a common stop word. Only an English
// lexicon exists so far, so it also serves as the fallback for other
// language codes.
func isStopWord(word, language string) bool {
	switch language {
	case "en":
		_, ok := stopWordsEN[word]
		return ok
	default:
		_, ok := stopWordsEN[word]
		return ok
	}
}

// isPositiveWord checks whether a word is in the positive lexicon
// (English-only for now, used as the fallback for all languages).
func isPositiveWord(word, language string) bool {
	switch language {
	case "en":
		_, ok := positiveEN[word]
		return ok
	default:
		_, ok := positiveEN[word]
		return ok
	}
}

// isNegativeWord checks whether a word is in the negative lexicon
// (English-only for now, used as the fallback for all languages).
func isNegativeWord(word, language string) bool {
	switch language {
	case "en":
		_, ok := negativeEN[word]
		return ok
	default:
		_, ok := negativeEN[word]
		return ok
	}
}

// tokenizeWords returns lowercase alphanumeric tokens in a single pass.
func tokenizeWords(text string) []string {
	if text == "" {
		return nil
	}
	tokens := make([]string, 0, 256)
	var b strings.Builder
	b.Grow(32)
	for _, r := range text {
		lr := unicode.ToLower(r)
		if unicode.IsLetter(lr) || unicode.IsNumber(lr) {
			b.WriteRune(lr)
		} else if b.Len() > 0 {
			// A non-alphanumeric rune closes the current token.
			tokens = append(tokens, b.String())
			b.Reset()
		}
	}
	if b.Len() > 0 {
		tokens = append(tokens, b.String())
	}
	return tokens
}
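
// Illustrative example (not in the original source):
//
//	tokenizeWords("Hello, World! Room 42")
//	// -> []string{"hello", "world", "room", "42"}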