Mirror of https://github.com/SamyRai/tercul-backend.git (synced 2025-12-27 00:31:35 +00:00)
- Core Go application with GraphQL API using gqlgen
- Comprehensive data models for literary works, authors, and translations
- Repository pattern with caching layer
- Authentication and authorization system
- Linguistics analysis capabilities with multiple adapters
- Vector search integration with Weaviate
- Docker containerization support
- Python data migration and analysis scripts
- Clean architecture with proper separation of concerns
- Production-ready configuration and middleware
- Proper .gitignore excluding vendor/, database files, and build artifacts
330 lines
8.6 KiB
Go

package linguistics

import (
	"sort"
	"strings"
	"unicode"
)

// Precomputed lexical resources for fast lookups
var (
	stopWordsEN = map[string]struct{}{
		"the": {}, "a": {}, "an": {}, "and": {}, "or": {}, "but": {},
		"in": {}, "on": {}, "at": {}, "to": {}, "for": {}, "of": {},
		"with": {}, "by": {}, "is": {}, "are": {}, "was": {}, "were": {},
		"be": {}, "been": {}, "being": {}, "have": {}, "has": {}, "had": {},
		"do": {}, "does": {}, "did": {}, "will": {}, "would": {}, "could": {},
		"should": {}, "may": {}, "might": {}, "can": {}, "this": {}, "that": {},
		"these": {}, "those": {}, "i": {}, "you": {}, "he": {}, "she": {},
		"it": {}, "we": {}, "they": {}, "me": {}, "him": {}, "hers": {},
		"us": {}, "them": {}, "my": {}, "your": {}, "his": {}, "its": {},
		"our": {}, "their": {},
	}

	positiveEN = map[string]struct{}{
		"good": {}, "great": {}, "excellent": {}, "amazing": {}, "wonderful": {},
		"beautiful": {}, "love": {}, "happy": {}, "joy": {}, "success": {},
		"win": {}, "winning": {}, "best": {}, "perfect": {}, "fantastic": {},
		"brilliant": {}, "outstanding": {}, "superb": {}, "magnificent": {},
		"delightful": {}, "pleasure": {}, "enjoy": {}, "enjoyable": {},
	}

	negativeEN = map[string]struct{}{
		"bad": {}, "terrible": {}, "awful": {}, "horrible": {}, "disgusting": {},
		"hate": {}, "sad": {}, "angry": {}, "furious": {}, "disappointed": {},
		"fail": {}, "failure": {}, "lose": {}, "losing": {}, "worst": {},
		"dreadful": {}, "miserable": {}, "painful": {},
		"annoying": {}, "frustrating": {}, "upset": {}, "depressed": {},
	}
)

// analyzeTextBasicStats performs basic text statistics analysis
func analyzeTextBasicStats(text string) (words, sentences, paragraphs int, avgWordLength float64) {
	if text == "" {
		return 0, 0, 0, 0
	}

	// Single pass scanner over runes
	inWord := false
	wordLen := 0
	totalWordLen := 0
	words = 0
	sentences = 0
	paragraphs = 1

	prevWasNewline := false

	for _, r := range text {
		// Paragraphs: count double newline boundaries
		if r == '\n' {
			if prevWasNewline {
				paragraphs++
				prevWasNewline = false // avoid counting more than once for >2 newlines
			} else {
				prevWasNewline = true
			}
		} else {
			prevWasNewline = false
		}

		// Sentences: simple heuristic on end punctuation
		if r == '.' || r == '!' || r == '?' {
			sentences++
		}

		// Words: alphanumeric sequences
		if unicode.IsLetter(r) || unicode.IsNumber(r) {
			inWord = true
			wordLen++
		} else {
			if inWord {
				words++
				totalWordLen += wordLen
				inWord = false
				wordLen = 0
			}
		}
	}

	if inWord {
		words++
		totalWordLen += wordLen
	}

	if words > 0 {
		avgWordLength = float64(totalWordLen) / float64(words)
	}

	return words, sentences, paragraphs, avgWordLength
}
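
// Illustrative usage (a sketch, not part of the original file). The numbers
// follow from the heuristics above: '.' and '!' each end a sentence, and a
// single '\n' does not start a new paragraph.
//
//	words, sentences, paragraphs, avg := analyzeTextBasicStats("Hello world.\nBye!")
//	// words=3, sentences=2, paragraphs=1, avg=13.0/3 ≈ 4.33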

// analyzeChunkBasicStats performs basic statistics on a text chunk. It returns
// the raw word-length sum and word count so that per-chunk results can be
// merged into whole-text averages.
func analyzeChunkBasicStats(chunk string) (words, sentences, paragraphs int, wordLengthSum float64, wordCount int) {
	if chunk == "" {
		return 0, 0, 0, 0, 0
	}

	inWord := false
	wordLen := 0
	totalWordLen := 0
	words = 0
	sentences = 0
	paragraphs = 1

	prevWasNewline := false

	for _, r := range chunk {
		if r == '\n' {
			if prevWasNewline {
				paragraphs++
				prevWasNewline = false
			} else {
				prevWasNewline = true
			}
		} else {
			prevWasNewline = false
		}

		if r == '.' || r == '!' || r == '?' {
			sentences++
		}

		if unicode.IsLetter(r) || unicode.IsNumber(r) {
			inWord = true
			wordLen++
		} else {
			if inWord {
				words++
				totalWordLen += wordLen
				inWord = false
				wordLen = 0
			}
		}
	}

	if inWord {
		words++
		totalWordLen += wordLen
	}

	wordLengthSum = float64(totalWordLen)
	wordCount = words
	return words, sentences, paragraphs, wordLengthSum, wordCount
}
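
// A hypothetical merge of per-chunk results (a sketch; this helper is not in
// the original file, and chunkA/chunkB are illustrative names). Each chunk
// reports at least one paragraph, so a naive sum overcounts paragraphs whose
// boundary falls between chunks.
//
//	w1, s1, _, sum1, n1 := analyzeChunkBasicStats(chunkA)
//	w2, s2, _, sum2, n2 := analyzeChunkBasicStats(chunkB)
//	totalWords := w1 + w2
//	totalSentences := s1 + s2
//	avgWordLength := (sum1 + sum2) / float64(n1+n2) // guard against n1+n2 == 0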

// splitTextIntoChunks splits text into chunks for concurrent processing
func splitTextIntoChunks(text string, numChunks int) []string {
	if numChunks <= 1 || text == "" {
		return []string{text}
	}

	// Split by sentences to avoid breaking words
	sentences := strings.FieldsFunc(text, func(r rune) bool {
		return r == '.' || r == '!' || r == '?'
	})

	if len(sentences) == 0 {
		return []string{text}
	}

	if numChunks > len(sentences) {
		numChunks = len(sentences)
	}
	chunks := make([]string, numChunks)
	chunkSize := len(sentences) / numChunks
	remainder := len(sentences) % numChunks

	start := 0
	for i := 0; i < numChunks; i++ {
		end := start + chunkSize
		if i < remainder {
			end++
		}

		if end > len(sentences) {
			end = len(sentences)
		}

		// Note: rejoining with ". " normalizes '!' and '?' terminators to
		// periods, which is acceptable for the statistics computed here.
		chunks[i] = strings.Join(sentences[start:end], ". ")
		start = end
	}

	return chunks
}
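
// Illustrative usage (a sketch, not part of the original file; spaces around
// sentence boundaries are omitted to keep the output easy to read):
//
//	chunks := splitTextIntoChunks("One.Two!Three?Four.", 2)
//	// chunks[0] == "One. Two", chunks[1] == "Three. Four"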

// calculateReadabilityScore calculates a simplified readability score based on
// the Flesch Reading Ease formula
func calculateReadabilityScore(avgSentenceLength, avgWordLength float64) float64 {
	// Flesch Reading Ease:
	// 206.835 - 1.015 × (total words ÷ total sentences) - 84.6 × (total syllables ÷ total words)
	// Simplified here: average word length in characters stands in for
	// syllables per word, so scores are not directly comparable to the
	// standard scale.
	score := 206.835 - 1.015*avgSentenceLength - 84.6*avgWordLength

	// Clamp to the conventional 0-100 range
	if score < 0 {
		score = 0
	} else if score > 100 {
		score = 100
	}

	return score
}
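
// A worked example (illustrative): 10 words per sentence and 2.0 characters
// per word give
//
//	score := calculateReadabilityScore(10, 2.0)
//	// 206.835 - 1.015*10 - 84.6*2.0 = 206.835 - 10.15 - 169.2 = 27.485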

// extractKeywordsOptimized extracts keywords from text using a simplified
// frequency-based approach
func extractKeywordsOptimized(text, language string) []Keyword {
	if text == "" {
		return []Keyword{}
	}

	tokens := tokenizeWords(text)
	if len(tokens) == 0 {
		return []Keyword{}
	}

	wordFreq := make(map[string]int, len(tokens))
	for _, tok := range tokens {
		if len(tok) > 2 && !isStopWord(tok, language) {
			wordFreq[tok]++
		}
	}

	total := len(tokens)
	keywords := make([]Keyword, 0, len(wordFreq))
	for word, freq := range wordFreq {
		relevance := float64(freq) / float64(total)
		if relevance > 0.01 {
			keywords = append(keywords, Keyword{Text: word, Relevance: relevance})
		}
	}

	sort.Slice(keywords, func(i, j int) bool { return keywords[i].Relevance > keywords[j].Relevance })
	if len(keywords) > 10 {
		keywords = keywords[:10]
	}

	return keywords
}
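
// Illustrative usage (a sketch; the Keyword type is defined elsewhere in this
// package):
//
//	kws := extractKeywordsOptimized("the cat sat on the mat with the cat", "en")
//	// 9 tokens total; "the", "on", "with" are filtered out, leaving
//	// cat 2/9 ≈ 0.22, sat 1/9 ≈ 0.11, mat 1/9 ≈ 0.11, sorted by relevance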

// estimateSentimentOptimized estimates sentiment using a simplified approach
func estimateSentimentOptimized(text, language string) float64 {
	if text == "" {
		return 0
	}

	tokens := tokenizeWords(text)
	positiveCount := 0
	negativeCount := 0
	for _, tok := range tokens {
		if isPositiveWord(tok, language) {
			positiveCount++
		} else if isNegativeWord(tok, language) {
			negativeCount++
		}
	}

	total := positiveCount + negativeCount
	if total == 0 {
		return 0
	}

	// Return sentiment score between -1 and 1
	return float64(positiveCount-negativeCount) / float64(total)
}
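
// A worked example (illustrative): "love" and "happy" are in positiveEN and
// "sad" is in negativeEN, so
//
//	s := estimateSentimentOptimized("love happy sad", "en")
//	// (2 - 1) / (2 + 1) ≈ 0.33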

// isStopWord checks if a word is a common stop word. Only English lists exist
// so far, so every language currently falls back to the English set.
func isStopWord(word, language string) bool {
	_, ok := stopWordsEN[word]
	return ok
}

// isPositiveWord checks if a word is positive (English list only for now)
func isPositiveWord(word, language string) bool {
	_, ok := positiveEN[word]
	return ok
}

// isNegativeWord checks if a word is negative (English list only for now)
func isNegativeWord(word, language string) bool {
	_, ok := negativeEN[word]
	return ok
}
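
// A hypothetical extension (not in the original file): keying the lexicons by
// language code would let new languages slot in without touching the helpers.
// stopWordsByLang and isStopWordMulti are illustrative names.
//
//	var stopWordsByLang = map[string]map[string]struct{}{
//		"en": stopWordsEN,
//	}
//
//	func isStopWordMulti(word, language string) bool {
//		set, ok := stopWordsByLang[language]
//		if !ok {
//			set = stopWordsEN // fall back to English
//		}
//		_, found := set[word]
//		return found
//	}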

// tokenizeWords returns lowercase alphanumeric tokens using a single pass
func tokenizeWords(text string) []string {
	if text == "" {
		return nil
	}
	tokens := make([]string, 0, 256)
	var b strings.Builder
	b.Grow(32)
	for _, r := range strings.ToLower(text) {
		if unicode.IsLetter(r) || unicode.IsNumber(r) {
			b.WriteRune(r)
		} else if b.Len() > 0 {
			tokens = append(tokens, b.String())
			b.Reset()
		}
	}
	if b.Len() > 0 {
		tokens = append(tokens, b.String())
	}
	return tokens
}
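
// Illustrative usage (a sketch, not part of the original file): apostrophes
// and dashes split tokens because only letters and digits are kept.
//
//	tokens := tokenizeWords("Don't panic - 42 towels!")
//	// ["don", "t", "panic", "42", "towels"]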