tercul-backend/linguistics/text_utils.go
package linguistics

import (
	"sort"
	"strings"
	"unicode"
)

// Precomputed lexical resources for fast lookups
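// Each set uses the zero-byte map[string]struct{} idiom; membership is a
// single lookup, e.g. _, ok := stopWordsEN["the"].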
var (
	stopWordsEN = map[string]struct{}{
		"the": {}, "a": {}, "an": {}, "and": {}, "or": {}, "but": {},
		"in": {}, "on": {}, "at": {}, "to": {}, "for": {}, "of": {},
		"with": {}, "by": {}, "is": {}, "are": {}, "was": {}, "were": {},
		"be": {}, "been": {}, "being": {}, "have": {}, "has": {}, "had": {},
		"do": {}, "does": {}, "did": {}, "will": {}, "would": {}, "could": {},
		"should": {}, "may": {}, "might": {}, "can": {}, "this": {}, "that": {},
		"these": {}, "those": {}, "i": {}, "you": {}, "he": {}, "she": {},
		"it": {}, "we": {}, "they": {}, "me": {}, "him": {}, "her": {},
		"us": {}, "them": {}, "my": {}, "your": {}, "his": {}, "its": {},
		"our": {}, "their": {},
	}
	positiveEN = map[string]struct{}{
		"good": {}, "great": {}, "excellent": {}, "amazing": {}, "wonderful": {},
		"beautiful": {}, "love": {}, "happy": {}, "joy": {}, "success": {},
		"win": {}, "winning": {}, "best": {}, "perfect": {}, "fantastic": {},
		"brilliant": {}, "outstanding": {}, "superb": {}, "magnificent": {},
		"delightful": {}, "pleasure": {}, "enjoy": {}, "enjoyable": {},
	}
	negativeEN = map[string]struct{}{
		"bad": {}, "terrible": {}, "awful": {}, "horrible": {}, "disgusting": {},
		"hate": {}, "sad": {}, "angry": {}, "furious": {}, "disappointed": {},
		"fail": {}, "failure": {}, "lose": {}, "losing": {}, "worst": {},
		"dreadful": {}, "miserable": {}, "painful": {},
		"annoying": {}, "frustrating": {}, "upset": {}, "depressed": {},
	}
)

// analyzeTextBasicStats computes word, sentence, and paragraph counts plus the
// average word length in a single pass over the text
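// For example, analyzeTextBasicStats("Hello world. Foo!\n\nBar?") returns
// words=4, sentences=3, paragraphs=2, avgWordLength=4.0.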
func analyzeTextBasicStats(text string) (words, sentences, paragraphs int, avgWordLength float64) {
	if text == "" {
		return 0, 0, 0, 0
	}
	// Single pass scanner over runes
	inWord := false
	wordLen := 0
	totalWordLen := 0
	words = 0
	sentences = 0
	paragraphs = 1
	prevWasNewline := false
	for _, r := range text {
		// Paragraphs: count double newline boundaries
		if r == '\n' {
			if prevWasNewline {
				paragraphs++
				prevWasNewline = false // avoid counting more than once for >2 newlines
			} else {
				prevWasNewline = true
			}
		} else {
			prevWasNewline = false
		}
		// Sentences: simple heuristic on end punctuation
		if r == '.' || r == '!' || r == '?' {
			sentences++
		}
		// Words: alphanumeric sequences
		if unicode.IsLetter(r) || unicode.IsNumber(r) {
			inWord = true
			wordLen++
		} else if inWord {
			words++
			totalWordLen += wordLen
			inWord = false
			wordLen = 0
		}
	}
	if inWord {
		words++
		totalWordLen += wordLen
	}
	if words > 0 {
		avgWordLength = float64(totalWordLen) / float64(words)
	}
	return words, sentences, paragraphs, avgWordLength
}

// analyzeChunkBasicStats performs basic statistics on a single text chunk,
// returning the raw word-length sum and word count so that results from
// multiple chunks can be merged exactly (see the sketch after splitTextIntoChunks)
func analyzeChunkBasicStats(chunk string) (words, sentences, paragraphs int, wordLengthSum float64, wordCount int) {
	if chunk == "" {
		return 0, 0, 0, 0, 0
	}
	inWord := false
	wordLen := 0
	totalWordLen := 0
	words = 0
	sentences = 0
	paragraphs = 1
	prevWasNewline := false
	for _, r := range chunk {
		if r == '\n' {
			if prevWasNewline {
				paragraphs++
				prevWasNewline = false
			} else {
				prevWasNewline = true
			}
		} else {
			prevWasNewline = false
		}
		if r == '.' || r == '!' || r == '?' {
			sentences++
		}
		if unicode.IsLetter(r) || unicode.IsNumber(r) {
			inWord = true
			wordLen++
		} else if inWord {
			words++
			totalWordLen += wordLen
			inWord = false
			wordLen = 0
		}
	}
	if inWord {
		words++
		totalWordLen += wordLen
	}
	wordLengthSum = float64(totalWordLen)
	wordCount = words
	return words, sentences, paragraphs, wordLengthSum, wordCount
}

// splitTextIntoChunks splits text into chunks for concurrent processing
func splitTextIntoChunks(text string, numChunks int) []string {
	if numChunks <= 1 || text == "" {
		return []string{text}
	}
	// Split on sentence boundaries to avoid breaking words
	sentences := strings.FieldsFunc(text, func(r rune) bool {
		return r == '.' || r == '!' || r == '?'
	})
	if len(sentences) == 0 {
		return []string{text}
	}
	if numChunks > len(sentences) {
		numChunks = len(sentences)
	}
	chunks := make([]string, numChunks)
	chunkSize := len(sentences) / numChunks
	remainder := len(sentences) % numChunks
	start := 0
	for i := 0; i < numChunks; i++ {
		end := start + chunkSize
		// Distribute the remainder across the first chunks
		if i < remainder {
			end++
		}
		if end > len(sentences) {
			end = len(sentences)
		}
		// Rejoin with ". "; the original end punctuation is not preserved
		chunks[i] = strings.Join(sentences[start:end], ". ")
		start = end
	}
	return chunks
}
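
// exampleConcurrentStats is an illustrative sketch (not part of the original
// API) of how splitTextIntoChunks and analyzeChunkBasicStats might be combined
// for concurrent analysis. Chunked sentence counts can differ slightly from
// whole-text counts because splitTextIntoChunks rejoins sentences with ". "
// and drops the final delimiter; paragraph counts are omitted because they
// cannot be merged correctly across chunk boundaries.
func exampleConcurrentStats(text string, workers int) (words, sentences int, avgWordLength float64) {
	chunks := splitTextIntoChunks(text, workers)
	type result struct {
		words, sentences int
		lengthSum        float64
	}
	results := make(chan result, len(chunks))
	for _, c := range chunks {
		go func(c string) {
			w, s, _, sum, _ := analyzeChunkBasicStats(c)
			results <- result{words: w, sentences: s, lengthSum: sum}
		}(c)
	}
	// Merge partial results; the raw sums make the aggregation exact
	var lengthSum float64
	for range chunks {
		r := <-results
		words += r.words
		sentences += r.sentences
		lengthSum += r.lengthSum
	}
	if words > 0 {
		avgWordLength = lengthSum / float64(words)
	}
	return words, sentences, avgWordLength
}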

// calculateReadabilityScore calculates a simplified readability score based on
// the Flesch Reading Ease formula
func calculateReadabilityScore(avgSentenceLength, avgWordLength float64) float64 {
	// Flesch Reading Ease: 206.835 - 1.015 × (total words ÷ total sentences) - 84.6 × (total syllables ÷ total words)
	// Simplified: 206.835 - 1.015 × avgSentenceLength - 84.6 × avgWordLength
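	// Note: characters-per-word stands in for syllables-per-word here, so the
	// 84.6 coefficient dominates; e.g. avgSentenceLength=15, avgWordLength=4.5
	// gives 206.835 - 15.225 - 380.7 ≈ -189, which the clamp below maps to 0.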
	score := 206.835 - 1.015*avgSentenceLength - 84.6*avgWordLength
	// Clamp to the conventional 0-100 range
	if score < 0 {
		score = 0
	} else if score > 100 {
		score = 100
	}
	return score
}

// extractKeywordsOptimized extracts keywords from text using a simplified approach
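// The pipeline: tokenize, drop stop words and tokens of fewer than three
// characters, compute each remaining word's relative frequency (count ÷ total
// tokens), keep words above a 1% relevance floor, and return at most the top
// 10 by relevance.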
func extractKeywordsOptimized(text, language string) []Keyword {
	if text == "" {
		return []Keyword{}
	}
	tokens := tokenizeWords(text)
	if len(tokens) == 0 {
		return []Keyword{}
	}
	wordFreq := make(map[string]int, len(tokens))
	for _, tok := range tokens {
		// len is byte length, which matches character count for ASCII English
		if len(tok) > 2 && !isStopWord(tok, language) {
			wordFreq[tok]++
		}
	}
	total := len(tokens)
	keywords := make([]Keyword, 0, len(wordFreq))
	for word, freq := range wordFreq {
		relevance := float64(freq) / float64(total)
		if relevance > 0.01 {
			keywords = append(keywords, Keyword{Text: word, Relevance: relevance})
		}
	}
	sort.Slice(keywords, func(i, j int) bool { return keywords[i].Relevance > keywords[j].Relevance })
	if len(keywords) > 10 {
		keywords = keywords[:10]
	}
	return keywords
}

// estimateSentimentOptimized estimates sentiment with a lexicon-count approach,
// returning a score in [-1, 1]
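// For example, a text whose tokens include "love" twice and "hate" once (and
// no other lexicon hits) scores (2-1)/3 ≈ 0.33.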
func estimateSentimentOptimized(text, language string) float64 {
	if text == "" {
		return 0
	}
	tokens := tokenizeWords(text)
	positiveCount := 0
	negativeCount := 0
	for _, tok := range tokens {
		if isPositiveWord(tok, language) {
			positiveCount++
		} else if isNegativeWord(tok, language) {
			negativeCount++
		}
	}
	total := positiveCount + negativeCount
	if total == 0 {
		return 0
	}
	// Return sentiment score between -1 and 1
	return float64(positiveCount-negativeCount) / float64(total)
}

// isStopWord checks if a word is a common stop word. Only an English lexicon
// is available, so it also serves as the fallback for other languages
func isStopWord(word, language string) bool {
	_, ok := stopWordsEN[word]
	return ok
}

// isPositiveWord checks if a word is positive (English lexicon, also the
// fallback for other languages)
func isPositiveWord(word, language string) bool {
	_, ok := positiveEN[word]
	return ok
}

// isNegativeWord checks if a word is negative (English lexicon, also the
// fallback for other languages)
func isNegativeWord(word, language string) bool {
	_, ok := negativeEN[word]
	return ok
}

// tokenizeWords returns lowercase alphanumeric tokens using a single pass
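// For example, tokenizeWords("Hello, World! 42") returns ["hello", "world", "42"].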
func tokenizeWords(text string) []string {
	if text == "" {
		return nil
	}
	tokens := make([]string, 0, 256)
	var b strings.Builder
	b.Grow(32)
	for _, r := range strings.ToLower(text) {
		if unicode.IsLetter(r) || unicode.IsNumber(r) {
			b.WriteRune(r)
		} else if b.Len() > 0 {
			tokens = append(tokens, b.String())
			b.Reset()
		}
	}
	if b.Len() > 0 {
		tokens = append(tokens, b.String())
	}
	return tokens
}