package linguistics

import (
	"sort"
	"strings"
	"unicode"
)

// Precomputed lexical resources for fast lookups
var (
	stopWordsEN = map[string]struct{}{
		"the": {}, "a": {}, "an": {}, "and": {}, "or": {}, "but": {}, "in": {},
		"on": {}, "at": {}, "to": {}, "for": {}, "of": {}, "with": {}, "by": {},
		"is": {}, "are": {}, "was": {}, "were": {}, "be": {}, "been": {}, "being": {},
		"have": {}, "has": {}, "had": {}, "do": {}, "does": {}, "did": {},
		"will": {}, "would": {}, "could": {}, "should": {}, "may": {}, "might": {}, "can": {},
		"this": {}, "that": {}, "these": {}, "those": {},
		"i": {}, "you": {}, "he": {}, "she": {}, "it": {}, "we": {}, "they": {},
		"me": {}, "him": {}, "hers": {}, "over": {}, "us": {}, "them": {},
		"my": {}, "your": {}, "his": {}, "its": {}, "our": {}, "their": {},
	}

	positiveEN = map[string]struct{}{
		"good": {}, "great": {}, "excellent": {}, "amazing": {}, "wonderful": {},
		"beautiful": {}, "love": {}, "happy": {}, "joy": {}, "success": {},
		"win": {}, "winning": {}, "best": {}, "perfect": {}, "fantastic": {},
		"brilliant": {}, "outstanding": {}, "superb": {}, "magnificent": {},
		"delightful": {}, "pleasure": {}, "enjoy": {}, "enjoyable": {},
	}

	negativeEN = map[string]struct{}{
		"bad": {}, "terrible": {}, "awful": {}, "horrible": {}, "disgusting": {},
		"hate": {}, "sad": {}, "angry": {}, "furious": {}, "disappointed": {},
		"fail": {}, "failure": {}, "lose": {}, "losing": {}, "worst": {},
		"dreadful": {}, "miserable": {}, "painful": {}, "annoying": {},
		"frustrating": {}, "upset": {}, "depressed": {},
	}
)

// analyzeTextBasicStats performs basic text statistics analysis.
func analyzeTextBasicStats(text string) (words, sentences, paragraphs int, avgWordLength float64) {
	if text == "" {
		return 0, 0, 0, 0
	}
	words, sentences, paragraphs, totalWordLen := scanTextStats(text)
	if words > 0 {
		avgWordLength = float64(totalWordLen) / float64(words)
	}
	return words, sentences, paragraphs, avgWordLength
}

// analyzeChunkBasicStats performs basic statistics on a text chunk. The word
// length sum and word count are returned separately so that per-chunk results
// can be aggregated before computing an overall average word length.
func analyzeChunkBasicStats(chunk string) (words, sentences, paragraphs int, wordLengthSum float64, wordCount int) {
	if chunk == "" {
		return 0, 0, 0, 0, 0
	}
	words, sentences, paragraphs, totalWordLen := scanTextStats(chunk)
	return words, sentences, paragraphs, float64(totalWordLen), words
}

// scanTextStats is the shared core that scans text in a single pass and
// returns word, sentence, and paragraph counts along with the total word
// length. Sentence counting is approximate: every '.', '!', or '?' counts,
// so "Wait..." registers as three sentences.
func scanTextStats(text string) (words, sentences, paragraphs, totalWordLen int) {
	if text == "" {
		return 0, 0, 0, 0
	}
	inWord := false
	wordLen := 0
	paragraphs = 1
	prevWasNewline := false
	for _, r := range text {
		// A blank line (two consecutive newlines) starts a new paragraph.
		if r == '\n' {
			if prevWasNewline {
				paragraphs++
				prevWasNewline = false
			} else {
				prevWasNewline = true
			}
		} else {
			prevWasNewline = false
		}
		if r == '.' || r == '!' || r == '?' {
			sentences++
		}
		if unicode.IsLetter(r) || unicode.IsNumber(r) {
			inWord = true
			wordLen++
		} else if inWord {
			words++
			totalWordLen += wordLen
			inWord = false
			wordLen = 0
		}
	}
	if inWord {
		words++
		totalWordLen += wordLen
	}
	return
}

// splitTextIntoChunks splits text into chunks for concurrent processing.
// Splitting happens on sentence boundaries so words are never broken, but the
// original terminators are replaced with ". " when sentences are re-joined,
// so per-chunk sentence counts are approximate.
func splitTextIntoChunks(text string, numChunks int) []string {
	if numChunks <= 1 || text == "" {
		return []string{text}
	}
	// Split by sentences to avoid breaking words.
	sentences := strings.FieldsFunc(text, func(r rune) bool {
		return r == '.' || r == '!' || r == '?'
	})
	if len(sentences) == 0 {
		return []string{text}
	}
	if numChunks > len(sentences) {
		numChunks = len(sentences)
	}
	chunks := make([]string, numChunks)
	chunkSize := len(sentences) / numChunks
	remainder := len(sentences) % numChunks
	start := 0
	for i := 0; i < numChunks; i++ {
		// The first `remainder` chunks take one extra sentence so the
		// sentences are distributed as evenly as possible.
		end := start + chunkSize
		if i < remainder {
			end++
		}
		if end > len(sentences) {
			end = len(sentences)
		}
		chunks[i] = strings.Join(sentences[start:end], ". ")
		start = end
	}
	return chunks
}
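// The following is an illustrative sketch, not part of the original API: it
// shows one way the chunking and per-chunk helpers above could be fanned out
// across goroutines, which is the concurrent use case splitTextIntoChunks and
// analyzeChunkBasicStats are designed for. The chunkStats type and the
// analyzeConcurrently function are hypothetical names added for demonstration.
type chunkStats struct {
	words, sentences, paragraphs int
	wordLengthSum                float64
}

// analyzeConcurrently splits text into one chunk per worker, analyzes each
// chunk in its own goroutine, and aggregates the results. Note that sentence
// and paragraph counts are only approximate after chunking, since
// splitTextIntoChunks rewrites the original punctuation when re-joining.
func analyzeConcurrently(text string, workers int) (words int, avgWordLength float64) {
	chunks := splitTextIntoChunks(text, workers)
	results := make(chan chunkStats, len(chunks))
	for _, c := range chunks {
		go func(c string) {
			w, s, p, sum, _ := analyzeChunkBasicStats(c)
			results <- chunkStats{w, s, p, sum}
		}(c)
	}
	var totalLen float64
	for range chunks {
		st := <-results
		words += st.words
		totalLen += st.wordLengthSum
	}
	if words > 0 {
		avgWordLength = totalLen / float64(words)
	}
	return words, avgWordLength
}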
// calculateReadabilityScore calculates a simplified Flesch reading-ease
// score. The original formula is
//
//	206.835 - 1.015*(total words / total sentences) - 84.6*(total syllables / total words)
//
// Here average word length in characters stands in for syllables per word:
//
//	score = 206.835 - 1.015*avgSentenceLength - 84.6*avgWordLength
func calculateReadabilityScore(avgSentenceLength, avgWordLength float64) float64 {
	score := 206.835 - 1.015*avgSentenceLength - 84.6*avgWordLength
	// Clamp to the conventional 0-100 range.
	if score < 0 {
		score = 0
	} else if score > 100 {
		score = 100
	}
	return score
}

// extractKeywordsOptimized extracts keywords using a simple frequency-based
// approach: tokens longer than two characters that are not stop words are
// counted, scored by relative frequency, and the top ten are returned.
func extractKeywordsOptimized(text, language string) []Keyword {
	if text == "" {
		return []Keyword{}
	}
	tokens := tokenizeWords(text)
	if len(tokens) == 0 {
		return []Keyword{}
	}
	wordFreq := make(map[string]int, len(tokens))
	for _, tok := range tokens {
		if len(tok) > 2 && !isStopWord(tok, language) {
			wordFreq[tok]++
		}
	}
	total := len(tokens)
	keywords := make([]Keyword, 0, len(wordFreq))
	for word, freq := range wordFreq {
		relevance := float64(freq) / float64(total)
		// Drop words that make up 1% or less of the text.
		if relevance > 0.01 {
			keywords = append(keywords, Keyword{Text: word, Relevance: relevance})
		}
	}
	sort.Slice(keywords, func(i, j int) bool {
		return keywords[i].Relevance > keywords[j].Relevance
	})
	if len(keywords) > 10 {
		keywords = keywords[:10]
	}
	return keywords
}

// estimateSentimentOptimized estimates sentiment by counting lexicon hits and
// returns a score in [-1, 1]: -1 is entirely negative, 1 entirely positive,
// and 0 neutral (or no sentiment-bearing words at all).
func estimateSentimentOptimized(text, language string) float64 {
	if text == "" {
		return 0
	}
	tokens := tokenizeWords(text)
	positiveCount := 0
	negativeCount := 0
	for _, tok := range tokens {
		if isPositiveWord(tok, language) {
			positiveCount++
		} else if isNegativeWord(tok, language) {
			negativeCount++
		}
	}
	total := positiveCount + negativeCount
	if total == 0 {
		return 0
	}
	return float64(positiveCount-negativeCount) / float64(total)
}

// isStopWord reports whether a word is a common stop word. Only an English
// lexicon is available, so every language currently falls back to it.
func isStopWord(word, language string) bool {
	_, ok := stopWordsEN[word]
	return ok
}

// isPositiveWord reports whether a word appears in the positive lexicon.
// Only English is available; other languages fall back to it.
func isPositiveWord(word, language string) bool {
	_, ok := positiveEN[word]
	return ok
}

// isNegativeWord reports whether a word appears in the negative lexicon.
// Only English is available; other languages fall back to it.
func isNegativeWord(word, language string) bool {
	_, ok := negativeEN[word]
	return ok
}

// tokenizeWords returns lowercase alphanumeric tokens in a single pass over
// the text; any non-alphanumeric rune terminates the current token.
func tokenizeWords(text string) []string {
	if text == "" {
		return nil
	}
	tokens := make([]string, 0, 256)
	var b strings.Builder
	b.Grow(32)
	for _, r := range text {
		lr := unicode.ToLower(r)
		if unicode.IsLetter(lr) || unicode.IsNumber(lr) {
			b.WriteRune(lr)
		} else if b.Len() > 0 {
			tokens = append(tokens, b.String())
			b.Reset()
		}
	}
	if b.Len() > 0 {
		tokens = append(tokens, b.String())
	}
	return tokens
}
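// The following is an illustrative sketch, not part of the original API: it
// wires the helpers in this file into a single end-to-end report, showing how
// the basic statistics feed the readability formula. TextReport and
// BuildTextReport are hypothetical names added for demonstration; Keyword is
// the type already used by extractKeywordsOptimized above.
type TextReport struct {
	Words       int
	Sentences   int
	Paragraphs  int
	Readability float64
	Sentiment   float64
	Keywords    []Keyword
}

// BuildTextReport analyzes text once for basic statistics, derives the
// average sentence length (total words / total sentences) required by
// calculateReadabilityScore, and attaches sentiment and keywords.
func BuildTextReport(text, language string) TextReport {
	words, sentences, paragraphs, avgWordLen := analyzeTextBasicStats(text)
	var avgSentenceLen float64
	if sentences > 0 {
		avgSentenceLen = float64(words) / float64(sentences)
	}
	return TextReport{
		Words:       words,
		Sentences:   sentences,
		Paragraphs:  paragraphs,
		Readability: calculateReadabilityScore(avgSentenceLen, avgWordLen),
		Sentiment:   estimateSentimentOptimized(text, language),
		Keywords:    extractKeywordsOptimized(text, language),
	}
}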