tercul-backend/linguistics/text_utils.go
package linguistics

import (
	"sort"
	"strings"
	"unicode"
)

// Precomputed lexical resources for fast lookups
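// Each set uses the zero-byte map[string]struct{} idiom; membership is a
// single lookup, e.g. _, ok := stopWordsEN["the"].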
var (
	stopWordsEN = map[string]struct{}{
		"the": {}, "a": {}, "an": {}, "and": {}, "or": {}, "but": {},
		"in": {}, "on": {}, "at": {}, "to": {}, "for": {}, "of": {},
		"with": {}, "by": {}, "is": {}, "are": {}, "was": {}, "were": {},
		"be": {}, "been": {}, "being": {}, "have": {}, "has": {}, "had": {},
		"do": {}, "does": {}, "did": {}, "will": {}, "would": {}, "could": {},
		"should": {}, "may": {}, "might": {}, "can": {}, "this": {}, "that": {},
		"these": {}, "those": {}, "i": {}, "you": {}, "he": {}, "she": {},
		"it": {}, "we": {}, "they": {}, "me": {}, "him": {}, "her": {},
		"us": {}, "them": {}, "my": {}, "your": {}, "his": {}, "its": {},
		"our": {}, "their": {},
	}
	positiveEN = map[string]struct{}{
		"good": {}, "great": {}, "excellent": {}, "amazing": {}, "wonderful": {},
		"beautiful": {}, "love": {}, "happy": {}, "joy": {}, "success": {},
		"win": {}, "winning": {}, "best": {}, "perfect": {}, "fantastic": {},
		"brilliant": {}, "outstanding": {}, "superb": {}, "magnificent": {},
		"delightful": {}, "pleasure": {}, "enjoy": {}, "enjoyable": {},
	}
	negativeEN = map[string]struct{}{
		"bad": {}, "terrible": {}, "awful": {}, "horrible": {}, "disgusting": {},
		"hate": {}, "sad": {}, "angry": {}, "furious": {}, "disappointed": {},
		"fail": {}, "failure": {}, "lose": {}, "losing": {}, "worst": {},
		"dreadful": {}, "miserable": {}, "painful": {},
		"annoying": {}, "frustrating": {}, "upset": {}, "depressed": {},
	}
)

// analyzeTextBasicStats computes word, sentence, and paragraph counts plus the
// average word length in a single pass over the text
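// For example, analyzeTextBasicStats("Hello world. Foo!\n\nBar?") returns
// words=4, sentences=3, paragraphs=2, avgWordLength=4.0.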
func analyzeTextBasicStats(text string) (words, sentences, paragraphs int, avgWordLength float64) {
	if text == "" {
		return 0, 0, 0, 0
	}
	// Single pass scanner over runes
	inWord := false
	wordLen := 0
	totalWordLen := 0
	words = 0
	sentences = 0
	paragraphs = 1
	prevWasNewline := false
	for _, r := range text {
		// Paragraphs: count double newline boundaries
		if r == '\n' {
			if prevWasNewline {
				paragraphs++
				prevWasNewline = false // avoid counting more than once for >2 newlines
			} else {
				prevWasNewline = true
			}
		} else {
			prevWasNewline = false
		}
		// Sentences: simple heuristic on end punctuation
		if r == '.' || r == '!' || r == '?' {
			sentences++
		}
		// Words: alphanumeric sequences
		if unicode.IsLetter(r) || unicode.IsNumber(r) {
			inWord = true
			wordLen++
		} else if inWord {
			words++
			totalWordLen += wordLen
			inWord = false
			wordLen = 0
		}
	}
	if inWord {
		words++
		totalWordLen += wordLen
	}
	if words > 0 {
		avgWordLength = float64(totalWordLen) / float64(words)
	}
	return words, sentences, paragraphs, avgWordLength
}

// analyzeChunkBasicStats performs basic statistics on a single text chunk,
// returning the raw word-length sum and word count so that results from
// multiple chunks can be merged exactly (see the sketch after splitTextIntoChunks)
func analyzeChunkBasicStats(chunk string) (words, sentences, paragraphs int, wordLengthSum float64, wordCount int) {
	if chunk == "" {
		return 0, 0, 0, 0, 0
	}
	inWord := false
	wordLen := 0
	totalWordLen := 0
	words = 0
	sentences = 0
	paragraphs = 1
	prevWasNewline := false
	for _, r := range chunk {
		if r == '\n' {
			if prevWasNewline {
				paragraphs++
				prevWasNewline = false
			} else {
				prevWasNewline = true
			}
		} else {
			prevWasNewline = false
		}
		if r == '.' || r == '!' || r == '?' {
			sentences++
		}
		if unicode.IsLetter(r) || unicode.IsNumber(r) {
			inWord = true
			wordLen++
		} else if inWord {
			words++
			totalWordLen += wordLen
			inWord = false
			wordLen = 0
		}
	}
	if inWord {
		words++
		totalWordLen += wordLen
	}
	wordLengthSum = float64(totalWordLen)
	wordCount = words
	return words, sentences, paragraphs, wordLengthSum, wordCount
}

// splitTextIntoChunks splits text into chunks for concurrent processing
func splitTextIntoChunks(text string, numChunks int) []string {
	if numChunks <= 1 || text == "" {
		return []string{text}
	}
	// Split on sentence boundaries to avoid breaking words
	sentences := strings.FieldsFunc(text, func(r rune) bool {
		return r == '.' || r == '!' || r == '?'
	})
	if len(sentences) == 0 {
		return []string{text}
	}
	if numChunks > len(sentences) {
		numChunks = len(sentences)
	}
	chunks := make([]string, numChunks)
	chunkSize := len(sentences) / numChunks
	remainder := len(sentences) % numChunks
	start := 0
	for i := 0; i < numChunks; i++ {
		end := start + chunkSize
		// Distribute the remainder across the first chunks
		if i < remainder {
			end++
		}
		if end > len(sentences) {
			end = len(sentences)
		}
		// Rejoin with ". "; the original end punctuation is not preserved
		chunks[i] = strings.Join(sentences[start:end], ". ")
		start = end
	}
	return chunks
}
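
// exampleConcurrentStats is an illustrative sketch (not part of the original
// API) of how splitTextIntoChunks and analyzeChunkBasicStats might be combined
// for concurrent analysis. Chunked sentence counts can differ slightly from
// whole-text counts because splitTextIntoChunks rejoins sentences with ". "
// and drops the final delimiter; paragraph counts are omitted because they
// cannot be merged correctly across chunk boundaries.
func exampleConcurrentStats(text string, workers int) (words, sentences int, avgWordLength float64) {
	chunks := splitTextIntoChunks(text, workers)
	type result struct {
		words, sentences int
		lengthSum        float64
	}
	results := make(chan result, len(chunks))
	for _, c := range chunks {
		go func(c string) {
			w, s, _, sum, _ := analyzeChunkBasicStats(c)
			results <- result{words: w, sentences: s, lengthSum: sum}
		}(c)
	}
	// Merge partial results; the raw sums make the aggregation exact
	var lengthSum float64
	for range chunks {
		r := <-results
		words += r.words
		sentences += r.sentences
		lengthSum += r.lengthSum
	}
	if words > 0 {
		avgWordLength = lengthSum / float64(words)
	}
	return words, sentences, avgWordLength
}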

// calculateReadabilityScore calculates a simplified readability score based on
// the Flesch Reading Ease formula
func calculateReadabilityScore(avgSentenceLength, avgWordLength float64) float64 {
	// Flesch Reading Ease: 206.835 - 1.015 × (total words ÷ total sentences) - 84.6 × (total syllables ÷ total words)
	// Simplified: 206.835 - 1.015 × avgSentenceLength - 84.6 × avgWordLength
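	// Note: characters-per-word stands in for syllables-per-word here, so the
	// 84.6 coefficient dominates; e.g. avgSentenceLength=15, avgWordLength=4.5
	// gives 206.835 - 15.225 - 380.7 ≈ -189, which the clamp below maps to 0.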
	score := 206.835 - 1.015*avgSentenceLength - 84.6*avgWordLength
	// Clamp to the conventional 0-100 range
	if score < 0 {
		score = 0
	} else if score > 100 {
		score = 100
	}
	return score
}

// extractKeywordsOptimized extracts keywords from text using a simplified approach
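// The pipeline: tokenize, drop stop words and tokens of fewer than three
// characters, compute each remaining word's relative frequency (count ÷ total
// tokens), keep words above a 1% relevance floor, and return at most the top
// 10 by relevance.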
func extractKeywordsOptimized(text, language string) []Keyword {
	if text == "" {
		return []Keyword{}
	}
	tokens := tokenizeWords(text)
	if len(tokens) == 0 {
		return []Keyword{}
	}
	wordFreq := make(map[string]int, len(tokens))
	for _, tok := range tokens {
		// len is byte length, which matches character count for ASCII English
		if len(tok) > 2 && !isStopWord(tok, language) {
			wordFreq[tok]++
		}
	}
	total := len(tokens)
	keywords := make([]Keyword, 0, len(wordFreq))
	for word, freq := range wordFreq {
		relevance := float64(freq) / float64(total)
		if relevance > 0.01 {
			keywords = append(keywords, Keyword{Text: word, Relevance: relevance})
		}
	}
	sort.Slice(keywords, func(i, j int) bool { return keywords[i].Relevance > keywords[j].Relevance })
	if len(keywords) > 10 {
		keywords = keywords[:10]
	}
	return keywords
}

// estimateSentimentOptimized estimates sentiment with a lexicon-count approach,
// returning a score in [-1, 1]
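// For example, a text whose tokens include "love" twice and "hate" once (and
// no other lexicon hits) scores (2-1)/3 ≈ 0.33.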
func estimateSentimentOptimized(text, language string) float64 {
	if text == "" {
		return 0
	}
	tokens := tokenizeWords(text)
	positiveCount := 0
	negativeCount := 0
	for _, tok := range tokens {
		if isPositiveWord(tok, language) {
			positiveCount++
		} else if isNegativeWord(tok, language) {
			negativeCount++
		}
	}
	total := positiveCount + negativeCount
	if total == 0 {
		return 0
	}
	// Return sentiment score between -1 and 1
	return float64(positiveCount-negativeCount) / float64(total)
}

// isStopWord checks if a word is a common stop word. Only an English lexicon
// is available, so it also serves as the fallback for other languages
func isStopWord(word, language string) bool {
	_, ok := stopWordsEN[word]
	return ok
}

// isPositiveWord checks if a word is positive (English lexicon, also the
// fallback for other languages)
func isPositiveWord(word, language string) bool {
	_, ok := positiveEN[word]
	return ok
}

// isNegativeWord checks if a word is negative (English lexicon, also the
// fallback for other languages)
func isNegativeWord(word, language string) bool {
	_, ok := negativeEN[word]
	return ok
}

// tokenizeWords returns lowercase alphanumeric tokens using a single pass
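// For example, tokenizeWords("Hello, World! 42") returns ["hello", "world", "42"].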
func tokenizeWords(text string) []string {
	if text == "" {
		return nil
	}
	tokens := make([]string, 0, 256)
	var b strings.Builder
	b.Grow(32)
	for _, r := range strings.ToLower(text) {
		if unicode.IsLetter(r) || unicode.IsNumber(r) {
			b.WriteRune(r)
		} else if b.Len() > 0 {
			tokens = append(tokens, b.String())
			b.Reset()
		}
	}
	if b.Len() > 0 {
		tokens = append(tokens, b.String())
	}
	return tokens
}