tercul-backend/internal/jobs/linguistics/text_utils.go
google-labs-jules[bot] 53aa4d0344
Security Hardening and GraphQL Caching (#69)
* feat: add security middleware, graphql apq, and improved linting

- Add RateLimit, RequestValidation, and CORS middleware.
- Configure middleware chain in API server (see the sketch after this commit message).
- Implement Redis cache for GraphQL Automatic Persisted Queries.
- Add .golangci.yml and fix linting issues (shadowing, timeouts).

* feat: security, caching and linting config

- Fix .golangci.yml config for govet shadow check
- (Previous changes: Security middleware, GraphQL APQ, Linting fixes)

* fix: resolve remaining lint errors

- Fix unhandled errors in tests (errcheck)
- Define constants for repeated strings (goconst)
- Suppress high complexity warnings with nolint:gocyclo
- Fix integer overflow warnings (gosec)
- Add package comments
- Split long lines (lll)
- Rename Analyse -> Analyze (misspell)
- Fix naked returns and unused params

---------

Co-authored-by: google-labs-jules[bot] <161369871+google-labs-jules[bot]@users.noreply.github.com>
2025-12-01 00:14:22 +01:00
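
The middleware pieces listed above live elsewhere in the repository and are not shown on this page. As a rough illustration of the pattern the first bullet group describes, here is a minimal net/http middleware chain with a token-bucket rate limiter; the names (Chain, RateLimit) and the use of golang.org/x/time/rate are assumptions for the sketch, not the project's actual API:

package middleware

import (
	"net/http"

	"golang.org/x/time/rate"
)

// Chain applies middleware so the first entry becomes the outermost wrapper.
func Chain(h http.Handler, mw ...func(http.Handler) http.Handler) http.Handler {
	for i := len(mw) - 1; i >= 0; i-- {
		h = mw[i](h)
	}
	return h
}

// RateLimit rejects requests once a shared token bucket is exhausted.
func RateLimit(rps float64, burst int) func(http.Handler) http.Handler {
	limiter := rate.NewLimiter(rate.Limit(rps), burst)
	return func(next http.Handler) http.Handler {
		return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
			if !limiter.Allow() {
				http.Error(w, "rate limit exceeded", http.StatusTooManyRequests)
				return
			}
			next.ServeHTTP(w, r)
		})
	}
}

With this shape, a call like Chain(apiHandler, RateLimit(10, 20), corsMiddleware, requestValidation) (the last two being hypothetical stand-ins for the middleware named in the commit) wires the chain outermost-first.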

package linguistics

import (
	"sort"
	"strings"
	"unicode"
)

// Precomputed lexical resources for fast lookups.
var (
	stopWordsEN = map[string]struct{}{
		"the": {}, "a": {}, "an": {}, "and": {}, "or": {}, "but": {},
		"in": {}, "on": {}, "at": {}, "to": {}, "for": {}, "of": {},
		"with": {}, "by": {}, "is": {}, "are": {}, "was": {}, "were": {},
		"be": {}, "been": {}, "being": {}, "have": {}, "has": {}, "had": {},
		"do": {}, "does": {}, "did": {}, "will": {}, "would": {}, "could": {},
		"should": {}, "may": {}, "might": {}, "can": {}, "this": {}, "that": {},
		"these": {}, "those": {}, "i": {}, "you": {}, "he": {}, "she": {},
		"it": {}, "we": {}, "they": {}, "me": {}, "him": {}, "hers": {}, "over": {},
		"us": {}, "them": {}, "my": {}, "your": {}, "his": {}, "its": {},
		"our": {}, "their": {},
	}
	positiveEN = map[string]struct{}{
		"good": {}, "great": {}, "excellent": {}, "amazing": {}, "wonderful": {},
		"beautiful": {}, "love": {}, "happy": {}, "joy": {}, "success": {},
		"win": {}, "winning": {}, "best": {}, "perfect": {}, "fantastic": {},
		"brilliant": {}, "outstanding": {}, "superb": {}, "magnificent": {},
		"delightful": {}, "pleasure": {}, "enjoy": {}, "enjoyable": {},
	}
	negativeEN = map[string]struct{}{
		"bad": {}, "terrible": {}, "awful": {}, "horrible": {}, "disgusting": {},
		"hate": {}, "sad": {}, "angry": {}, "furious": {}, "disappointed": {},
		"fail": {}, "failure": {}, "lose": {}, "losing": {}, "worst": {},
		"dreadful": {}, "miserable": {}, "painful": {},
		"annoying": {}, "frustrating": {}, "upset": {}, "depressed": {},
	}
)

// analyzeTextBasicStats performs basic text statistics analysis.
func analyzeTextBasicStats(text string) (words, sentences, paragraphs int, avgWordLength float64) {
	if text == "" {
		return 0, 0, 0, 0
	}
	words, sentences, paragraphs, totalWordLen := scanTextStats(text)
	if words > 0 {
		avgWordLength = float64(totalWordLen) / float64(words)
	}
	return words, sentences, paragraphs, avgWordLength
}

// analyzeChunkBasicStats performs basic statistics on a text chunk. It
// returns the raw word-length sum and word count (rather than an average)
// so callers can aggregate averages correctly across chunks.
func analyzeChunkBasicStats(chunk string) (words, sentences, paragraphs int, wordLengthSum float64, wordCount int) {
	if chunk == "" {
		return 0, 0, 0, 0, 0
	}
	words, sentences, paragraphs, totalWordLen := scanTextStats(chunk)
	return words, sentences, paragraphs, float64(totalWordLen), words
}

// scanTextStats is the shared core that scans text once and returns the
// word, sentence, and paragraph counts along with the total word length.
func scanTextStats[T ~string](text T) (words int, sentences int, paragraphs int, totalWordLen int) {
	if len(text) == 0 {
		return 0, 0, 0, 0
	}
	inWord := false
	wordLen := 0
	words = 0
	sentences = 0
	paragraphs = 1
	prevWasNewline := false
	for _, r := range text {
		// A blank line (two consecutive newlines) starts a new paragraph.
		if r == '\n' {
			if prevWasNewline {
				paragraphs++
				prevWasNewline = false
			} else {
				prevWasNewline = true
			}
		} else {
			prevWasNewline = false
		}
		if r == '.' || r == '!' || r == '?' {
			sentences++
		}
		// Runs of letters and digits form words; any other rune ends one.
		if unicode.IsLetter(r) || unicode.IsNumber(r) {
			inWord = true
			wordLen++
		} else if inWord {
			words++
			totalWordLen += wordLen
			inWord = false
			wordLen = 0
		}
	}
	if inWord {
		words++
		totalWordLen += wordLen
	}
	return words, sentences, paragraphs, totalWordLen
}
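
// Illustrative example (not in the original source): for the input
// "Hello world. Second sentence!\n\nNew paragraph." scanTextStats yields
// words=6, sentences=3, paragraphs=2, totalWordLen=36, so the average
// word length reported by analyzeTextBasicStats would be 36/6 = 6.0.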

// splitTextIntoChunks splits text into chunks for concurrent processing.
func splitTextIntoChunks(text string, numChunks int) []string {
	if numChunks <= 1 || text == "" {
		return []string{text}
	}
	// Split on sentence boundaries to avoid breaking words. Note that the
	// original terminators ('.', '!', '?') are dropped here and every
	// boundary is rewritten as ". " when the chunks are rejoined below.
	sentences := strings.FieldsFunc(text, func(r rune) bool {
		return r == '.' || r == '!' || r == '?'
	})
	if len(sentences) == 0 {
		return []string{text}
	}
	if numChunks > len(sentences) {
		numChunks = len(sentences)
	}
	// Distribute sentences as evenly as possible: the first `remainder`
	// chunks each get one extra sentence.
	chunks := make([]string, numChunks)
	chunkSize := len(sentences) / numChunks
	remainder := len(sentences) % numChunks
	start := 0
	for i := 0; i < numChunks; i++ {
		end := start + chunkSize
		if i < remainder {
			end++
		}
		if end > len(sentences) {
			end = len(sentences)
		}
		chunks[i] = strings.Join(sentences[start:end], ". ")
		start = end
	}
	return chunks
}
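
// Illustrative example (not in the original source): splitting a text of
// 10 sentences with numChunks=3 gives chunkSize=3 and remainder=1, so the
// chunks receive 4, 3, and 3 sentences respectively.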

// calculateReadabilityScore calculates a simplified Flesch reading-ease
// score.
func calculateReadabilityScore(avgSentenceLength, avgWordLength float64) float64 {
	// Original Flesch reading-ease formula:
	//   206.835 - 1.015 × (total words ÷ total sentences) - 84.6 × (total syllables ÷ total words)
	// Simplified here by using average word length as a stand-in for
	// syllables per word:
	//   206.835 - 1.015 × avgSentenceLength - 84.6 × avgWordLength
	score := 206.835 - 1.015*avgSentenceLength - 84.6*avgWordLength
	// Clamp to the conventional 0-100 range.
	if score < 0 {
		score = 0
	} else if score > 100 {
		score = 100
	}
	return score
}
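
// Illustrative example (not in the original source): with an average
// sentence length of 12 words and an average word length of 2.0, the raw
// score is 206.835 - 1.015*12 - 84.6*2.0 = 25.455. Because avgWordLength
// is measured in characters rather than syllables, typical English prose
// (4-5 characters per word) drives the raw score negative, where it is
// clamped to 0.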

// extractKeywordsOptimized extracts keywords from text using a simplified
// frequency-based approach.
func extractKeywordsOptimized(text, language string) []Keyword {
	if text == "" {
		return []Keyword{}
	}
	tokens := tokenizeWords(text)
	if len(tokens) == 0 {
		return []Keyword{}
	}
	// Count tokens that are long enough and not stop words.
	wordFreq := make(map[string]int, len(tokens))
	for _, tok := range tokens {
		if len(tok) > 2 && !isStopWord(tok, language) {
			wordFreq[tok]++
		}
	}
	// Relevance is the token's share of all tokens; keep only words above
	// a 1% threshold, then return the top 10 by relevance.
	total := len(tokens)
	keywords := make([]Keyword, 0, len(wordFreq))
	for word, freq := range wordFreq {
		relevance := float64(freq) / float64(total)
		if relevance > 0.01 {
			keywords = append(keywords, Keyword{Text: word, Relevance: relevance})
		}
	}
	sort.Slice(keywords, func(i, j int) bool { return keywords[i].Relevance > keywords[j].Relevance })
	if len(keywords) > 10 {
		keywords = keywords[:10]
	}
	return keywords
}
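
// Usage sketch (illustrative; assumes the Keyword struct defined elsewhere
// in this package has the Text and Relevance fields used above):
//
//	for _, kw := range extractKeywordsOptimized("the spacecraft docked and the crew slept", "en") {
//		fmt.Printf("%-12s %.3f\n", kw.Text, kw.Relevance)
//	}
//
// Stop words ("the", "and") and tokens of length <= 2 are skipped, so only
// "spacecraft", "docked", "crew", and "slept" are counted, each with
// relevance 1/7.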

// estimateSentimentOptimized estimates sentiment with a simple lexicon
// count, returning a score between -1 (all negative) and 1 (all positive).
func estimateSentimentOptimized(text, language string) float64 {
	if text == "" {
		return 0
	}
	tokens := tokenizeWords(text)
	positiveCount := 0
	negativeCount := 0
	for _, tok := range tokens {
		if isPositiveWord(tok, language) {
			positiveCount++
		} else if isNegativeWord(tok, language) {
			negativeCount++
		}
	}
	total := positiveCount + negativeCount
	if total == 0 {
		return 0
	}
	return float64(positiveCount-negativeCount) / float64(total)
}
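
// Illustrative example (not in the original source): a text containing
// three positive tokens ("good", "happy", "love") and one negative token
// ("sad") scores (3-1)/4 = 0.5; text with no lexicon hits scores 0.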

// isStopWord checks whether a word is a common stop word. Only an English
// lexicon exists so far, so it also serves as the fallback for other
// language codes.
func isStopWord(word, language string) bool {
	switch language {
	case "en":
		_, ok := stopWordsEN[word]
		return ok
	default:
		_, ok := stopWordsEN[word]
		return ok
	}
}

// isPositiveWord checks whether a word is in the positive lexicon
// (English-only for now, used as the fallback for all languages).
func isPositiveWord(word, language string) bool {
	switch language {
	case "en":
		_, ok := positiveEN[word]
		return ok
	default:
		_, ok := positiveEN[word]
		return ok
	}
}

// isNegativeWord checks whether a word is in the negative lexicon
// (English-only for now, used as the fallback for all languages).
func isNegativeWord(word, language string) bool {
	switch language {
	case "en":
		_, ok := negativeEN[word]
		return ok
	default:
		_, ok := negativeEN[word]
		return ok
	}
}

// tokenizeWords returns lowercase alphanumeric tokens in a single pass.
func tokenizeWords(text string) []string {
	if text == "" {
		return nil
	}
	tokens := make([]string, 0, 256)
	var b strings.Builder
	b.Grow(32)
	for _, r := range text {
		lr := unicode.ToLower(r)
		if unicode.IsLetter(lr) || unicode.IsNumber(lr) {
			b.WriteRune(lr)
		} else if b.Len() > 0 {
			// A non-alphanumeric rune closes the current token.
			tokens = append(tokens, b.String())
			b.Reset()
		}
	}
	if b.Len() > 0 {
		tokens = append(tokens, b.String())
	}
	return tokens
}
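
// Illustrative example (not in the original source):
//
//	tokenizeWords("Hello, World! Room 42")
//	// -> []string{"hello", "world", "room", "42"}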