tercul-backend/linguistics/text_analyzer.go
package linguistics
import (
"context"
"sync"
)
// TextAnalyzer defines the interface for pure text analysis operations
type TextAnalyzer interface {
// AnalyzeText performs linguistic analysis on the given text
AnalyzeText(ctx context.Context, text string, language string) (*AnalysisResult, error)
// AnalyzeTextConcurrently performs text analysis using concurrent processing
AnalyzeTextConcurrently(ctx context.Context, text string, language string, concurrency int) (*AnalysisResult, error)
}
// BasicTextAnalyzer implements the TextAnalyzer interface with simple
// algorithms. All providers are optional: a nil or failing sentiment or
// keyword provider falls back to the package's built-in heuristics, and
// language detection is simply skipped when no detector is configured.
type BasicTextAnalyzer struct {
	langDetector      LanguageDetector
	sentimentProvider SentimentProvider
	keywordProvider   KeywordProvider
}
// NewBasicTextAnalyzer creates a new BasicTextAnalyzer
func NewBasicTextAnalyzer() *BasicTextAnalyzer {
return &BasicTextAnalyzer{}
}
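
// Example of builder-style wiring (det, sent, and kw are placeholder
// values assumed to implement the provider interfaces defined elsewhere
// in this package):
//
//	analyzer := NewBasicTextAnalyzer().
//		WithLanguageDetector(det).
//		WithSentimentProvider(sent).
//		WithKeywordProvider(kw)
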
// WithLanguageDetector injects a language detector provider
func (a *BasicTextAnalyzer) WithLanguageDetector(detector LanguageDetector) *BasicTextAnalyzer {
a.langDetector = detector
return a
}
// WithSentimentProvider injects a sentiment provider
func (a *BasicTextAnalyzer) WithSentimentProvider(provider SentimentProvider) *BasicTextAnalyzer {
a.sentimentProvider = provider
return a
}
// WithKeywordProvider injects a keyword provider
func (a *BasicTextAnalyzer) WithKeywordProvider(provider KeywordProvider) *BasicTextAnalyzer {
a.keywordProvider = provider
return a
}
// AnalyzeText performs linguistic analysis on the given text
func (a *BasicTextAnalyzer) AnalyzeText(ctx context.Context, text string, language string) (*AnalysisResult, error) {
	// Respect a context that is already cancelled; ctx is otherwise unused
	// on this sequential path.
	if err := ctx.Err(); err != nil {
		return nil, err
	}
	if text == "" {
		return &AnalysisResult{}, nil
	}
// Auto-detect language if not provided and a detector exists
if language == "" && a.langDetector != nil {
if detected, ok := a.langDetector.DetectLanguage(text); ok {
language = detected
}
}
result := &AnalysisResult{
PartOfSpeechCounts: make(map[string]int),
Entities: []Entity{},
Keywords: []Keyword{},
Topics: []Topic{},
}
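	// Entities, Topics, and PartOfSpeechCounts are initialized up front so
	// callers never see nil values; this basic analyzer leaves them empty.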
// Perform a single pass through the text for basic statistics
words, sentences, paragraphs, avgWordLength := analyzeTextBasicStats(text)
result.WordCount = words
result.SentenceCount = sentences
result.ParagraphCount = paragraphs
result.AvgWordLength = avgWordLength
// Calculate sentence length average
if result.SentenceCount > 0 {
result.AvgSentenceLength = float64(result.WordCount) / float64(result.SentenceCount)
}
// Calculate readability score (simplified Flesch-Kincaid)
result.ReadabilityScore = calculateReadabilityScore(result.AvgSentenceLength, result.AvgWordLength)
result.ReadabilityMethod = "Simplified Flesch-Kincaid"
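	// Note: classic Flesch-Kincaid readability uses syllables per word;
	// this simplified variant appears to substitute average word length as
	// a cheaper proxy (see calculateReadabilityScore).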
// Extract keywords: prefer provider if available
if a.keywordProvider != nil {
if kws, err := a.keywordProvider.Extract(text, language); err == nil {
result.Keywords = kws
} else {
result.Keywords = extractKeywordsOptimized(text, language)
}
} else {
result.Keywords = extractKeywordsOptimized(text, language)
}
// Sentiment: prefer provider if available
if a.sentimentProvider != nil {
if score, err := a.sentimentProvider.Score(text, language); err == nil {
result.Sentiment = score
} else {
result.Sentiment = estimateSentimentOptimized(text, language)
}
} else {
result.Sentiment = estimateSentimentOptimized(text, language)
}
return result, nil
}
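
// Example (assuming an analyzer wired as above; an empty language triggers
// auto-detection when a detector is configured):
//
//	res, err := analyzer.AnalyzeText(ctx, "Short sample text.", "")
//	if err == nil {
//		log.Printf("words=%d sentiment=%.2f", res.WordCount, res.Sentiment)
//	}
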
// AnalyzeTextConcurrently performs text analysis using concurrent processing
func (a *BasicTextAnalyzer) AnalyzeTextConcurrently(ctx context.Context, text string, language string, concurrency int) (*AnalysisResult, error) {
if text == "" {
return &AnalysisResult{}, nil
}
// Auto-detect language if not provided and a detector exists
if language == "" && a.langDetector != nil {
if detected, ok := a.langDetector.DetectLanguage(text); ok {
language = detected
}
}
	// Guard against non-positive concurrency, then split the text into
	// chunks for concurrent processing.
	if concurrency < 1 {
		concurrency = 1
	}
	chunks := splitTextIntoChunks(text, concurrency)
	n := len(chunks)
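	// Caveat: chunk boundaries may fall mid-sentence or mid-paragraph, so
	// sentence and paragraph counts can differ slightly from the sequential
	// path, and per-chunk sentiment is averaged without length weighting.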
// Create channels for results sized by actual chunks to avoid deadlocks
wordCountCh := make(chan int, n)
sentenceCountCh := make(chan int, n)
paragraphCountCh := make(chan int, n)
wordLengthSumCh := make(chan float64, n)
wordLengthCountCh := make(chan int, n)
keywordsCh := make(chan []Keyword, n)
sentimentCh := make(chan float64, n)
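	// Each goroutine sends at most one value per channel, so buffers of
	// size n guarantee the sends never block even though the channels are
	// drained only after wg.Wait().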
// Process each chunk concurrently
var wg sync.WaitGroup
for _, chunk := range chunks {
wg.Add(1)
go func(chunkText string) {
defer wg.Done()
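			// Non-blocking cancellation check: bail out if ctx is done,
			// otherwise fall through via the empty default case.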
select {
case <-ctx.Done():
return
default:
}
// Basic statistics
words, sentences, paragraphs, wordLengthSum, wordCount := analyzeChunkBasicStats(chunkText)
wordCountCh <- words
sentenceCountCh <- sentences
paragraphCountCh <- paragraphs
wordLengthSumCh <- wordLengthSum
wordLengthCountCh <- wordCount
// Keywords (provider if available)
if a.keywordProvider != nil {
if kws, err := a.keywordProvider.Extract(chunkText, language); err == nil {
keywordsCh <- kws
} else {
keywordsCh <- extractKeywordsOptimized(chunkText, language)
}
} else {
keywordsCh <- extractKeywordsOptimized(chunkText, language)
}
// Sentiment (provider if available)
if a.sentimentProvider != nil {
if score, err := a.sentimentProvider.Score(chunkText, language); err == nil {
sentimentCh <- score
} else {
sentimentCh <- estimateSentimentOptimized(chunkText, language)
}
} else {
sentimentCh <- estimateSentimentOptimized(chunkText, language)
}
}(chunk)
}
	// Wait for all goroutines to finish, then close the channels so the
	// aggregation loops below terminate.
	wg.Wait()
	// If the context was cancelled, some goroutines exited before sending
	// their results; report the cancellation rather than returning silently
	// partial counts.
	if err := ctx.Err(); err != nil {
		return nil, err
	}
	close(wordCountCh)
	close(sentenceCountCh)
	close(paragraphCountCh)
	close(wordLengthSumCh)
	close(wordLengthCountCh)
	close(keywordsCh)
	close(sentimentCh)
// Aggregate results
result := &AnalysisResult{
PartOfSpeechCounts: make(map[string]int),
Entities: []Entity{},
Keywords: []Keyword{},
Topics: []Topic{},
}
// Sum up basic statistics
for wc := range wordCountCh {
result.WordCount += wc
}
for sc := range sentenceCountCh {
result.SentenceCount += sc
}
for pc := range paragraphCountCh {
result.ParagraphCount += pc
}
// Calculate average word length
var totalWordLengthSum float64
var totalWordCount int
for wls := range wordLengthSumCh {
totalWordLengthSum += wls
}
for wlc := range wordLengthCountCh {
totalWordCount += wlc
}
if totalWordCount > 0 {
result.AvgWordLength = totalWordLengthSum / float64(totalWordCount)
}
// Calculate sentence length average
if result.SentenceCount > 0 {
result.AvgSentenceLength = float64(result.WordCount) / float64(result.SentenceCount)
}
// Calculate readability score
result.ReadabilityScore = calculateReadabilityScore(result.AvgSentenceLength, result.AvgWordLength)
result.ReadabilityMethod = "Simplified Flesch-Kincaid"
	// Merge keywords across chunks: average each keyword's relevance over
	// the chunks in which it appeared.
	keywordSum := make(map[string]float64)
	keywordCount := make(map[string]int)
	for kws := range keywordsCh {
		for _, kw := range kws {
			keywordSum[kw.Text] += kw.Relevance
			keywordCount[kw.Text]++
		}
	}
	// kwText avoids shadowing the text parameter.
	for kwText, sum := range keywordSum {
		if cnt := keywordCount[kwText]; cnt > 0 {
			result.Keywords = append(result.Keywords, Keyword{Text: kwText, Relevance: sum / float64(cnt)})
		}
	}
// Average sentiment
var totalSentiment float64
var sentimentCount int
for s := range sentimentCh {
totalSentiment += s
sentimentCount++
}
if sentimentCount > 0 {
result.Sentiment = totalSentiment / float64(sentimentCount)
}
return result, nil
}
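
// A minimal provider sketch, assuming SentimentProvider (defined elsewhere
// in this package) requires only the Score method used above:
//
//	type fixedSentiment struct{ v float64 }
//
//	func (f fixedSentiment) Score(_, _ string) (float64, error) { return f.v, nil }
//
//	analyzer := NewBasicTextAnalyzer().WithSentimentProvider(fixedSentiment{v: 0.5})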