// Mirror of https://github.com/SamyRai/tercul-backend.git
// (synced 2025-12-27 05:11:34 +00:00)
package linguistics
import (
	"context"
	"sync"
)
// TextAnalyzer defines the interface for pure text analysis operations.
// Implementations compute linguistic statistics over raw text without any
// persistence or I/O side effects.
type TextAnalyzer interface {
	// AnalyzeText performs linguistic analysis on the given text.
	// language may be empty, in which case implementations may auto-detect it.
	AnalyzeText(ctx context.Context, text string, language string) (*AnalysisResult, error)

	// AnalyzeTextConcurrently performs text analysis using concurrent
	// processing, splitting the work across up to `concurrency` workers.
	AnalyzeTextConcurrently(ctx context.Context, text string, language string, concurrency int) (*AnalysisResult, error)
}
// BasicTextAnalyzer implements the TextAnalyzer interface with simple
// algorithms. Optional providers can be injected via the With* builder
// methods; when a provider is nil the built-in heuristics are used instead.
type BasicTextAnalyzer struct {
	langDetector      LanguageDetector  // optional: auto-detects language when none is supplied
	sentimentProvider SentimentProvider // optional: external sentiment scoring
	keywordProvider   KeywordProvider   // optional: external keyword extraction
}
// NewBasicTextAnalyzer creates a new BasicTextAnalyzer
|
|
func NewBasicTextAnalyzer() *BasicTextAnalyzer {
|
|
return &BasicTextAnalyzer{}
|
|
}
|
|
|
|
// WithLanguageDetector injects a language detector provider
|
|
func (a *BasicTextAnalyzer) WithLanguageDetector(detector LanguageDetector) *BasicTextAnalyzer {
|
|
a.langDetector = detector
|
|
return a
|
|
}
|
|
|
|
// WithSentimentProvider injects a sentiment provider
|
|
func (a *BasicTextAnalyzer) WithSentimentProvider(provider SentimentProvider) *BasicTextAnalyzer {
|
|
a.sentimentProvider = provider
|
|
return a
|
|
}
|
|
|
|
// WithKeywordProvider injects a keyword provider
|
|
func (a *BasicTextAnalyzer) WithKeywordProvider(provider KeywordProvider) *BasicTextAnalyzer {
|
|
a.keywordProvider = provider
|
|
return a
|
|
}
|
|
|
|
// AnalyzeText performs linguistic analysis on the given text
|
|
func (a *BasicTextAnalyzer) AnalyzeText(ctx context.Context, text string, language string) (*AnalysisResult, error) {
|
|
if text == "" {
|
|
return &AnalysisResult{}, nil
|
|
}
|
|
|
|
// Auto-detect language if not provided and a detector exists
|
|
if language == "" && a.langDetector != nil {
|
|
if detected, err := a.langDetector.DetectLanguage(text); err == nil {
|
|
language = detected
|
|
}
|
|
}
|
|
|
|
result := &AnalysisResult{
|
|
PartOfSpeechCounts: make(map[string]int),
|
|
Entities: []Entity{},
|
|
Keywords: []Keyword{},
|
|
Topics: []Topic{},
|
|
}
|
|
|
|
// Perform a single pass through the text for basic statistics
|
|
words, sentences, paragraphs, avgWordLength := analyzeTextBasicStats(text)
|
|
|
|
result.WordCount = words
|
|
result.SentenceCount = sentences
|
|
result.ParagraphCount = paragraphs
|
|
result.AvgWordLength = avgWordLength
|
|
|
|
// Calculate sentence length average
|
|
if result.SentenceCount > 0 {
|
|
result.AvgSentenceLength = float64(result.WordCount) / float64(result.SentenceCount)
|
|
}
|
|
|
|
// Calculate readability score (simplified Flesch-Kincaid)
|
|
result.ReadabilityScore = calculateReadabilityScore(result.AvgSentenceLength, result.AvgWordLength)
|
|
result.ReadabilityMethod = "Simplified Flesch-Kincaid"
|
|
|
|
// Extract keywords: prefer provider if available
|
|
if a.keywordProvider != nil {
|
|
if kws, err := a.keywordProvider.Extract(text, language); err == nil {
|
|
result.Keywords = kws
|
|
} else {
|
|
result.Keywords = extractKeywordsOptimized(text, language)
|
|
}
|
|
} else {
|
|
result.Keywords = extractKeywordsOptimized(text, language)
|
|
}
|
|
|
|
// Sentiment: prefer provider if available
|
|
if a.sentimentProvider != nil {
|
|
if score, err := a.sentimentProvider.Score(text, language); err == nil {
|
|
result.Sentiment = score
|
|
} else {
|
|
result.Sentiment = estimateSentimentOptimized(text, language)
|
|
}
|
|
} else {
|
|
result.Sentiment = estimateSentimentOptimized(text, language)
|
|
}
|
|
|
|
return result, nil
|
|
}
|
|
|
|
// AnalyzeTextConcurrently performs text analysis using concurrent processing
|
|
func (a *BasicTextAnalyzer) AnalyzeTextConcurrently(ctx context.Context, text string, language string, concurrency int) (*AnalysisResult, error) {
|
|
if text == "" {
|
|
return &AnalysisResult{}, nil
|
|
}
|
|
|
|
// Auto-detect language if not provided and a detector exists
|
|
if language == "" && a.langDetector != nil {
|
|
if detected, err := a.langDetector.DetectLanguage(text); err == nil {
|
|
language = detected
|
|
}
|
|
}
|
|
|
|
// Split the text into chunks for concurrent processing
|
|
chunks := splitTextIntoChunks(text, concurrency)
|
|
n := len(chunks)
|
|
|
|
// Create channels for results sized by actual chunks to avoid deadlocks
|
|
wordCountCh := make(chan int, n)
|
|
sentenceCountCh := make(chan int, n)
|
|
paragraphCountCh := make(chan int, n)
|
|
wordLengthSumCh := make(chan float64, n)
|
|
wordLengthCountCh := make(chan int, n)
|
|
keywordsCh := make(chan []Keyword, n)
|
|
sentimentCh := make(chan float64, n)
|
|
|
|
// Process each chunk concurrently
|
|
var wg sync.WaitGroup
|
|
for _, chunk := range chunks {
|
|
wg.Add(1)
|
|
go func(chunkText string) {
|
|
defer wg.Done()
|
|
select {
|
|
case <-ctx.Done():
|
|
return
|
|
default:
|
|
}
|
|
|
|
// Basic statistics
|
|
words, sentences, paragraphs, wordLengthSum, wordCount := analyzeChunkBasicStats(chunkText)
|
|
wordCountCh <- words
|
|
sentenceCountCh <- sentences
|
|
paragraphCountCh <- paragraphs
|
|
wordLengthSumCh <- wordLengthSum
|
|
wordLengthCountCh <- wordCount
|
|
|
|
// Keywords (provider if available)
|
|
if a.keywordProvider != nil {
|
|
if kws, err := a.keywordProvider.Extract(chunkText, language); err == nil {
|
|
keywordsCh <- kws
|
|
} else {
|
|
keywordsCh <- extractKeywordsOptimized(chunkText, language)
|
|
}
|
|
} else {
|
|
keywordsCh <- extractKeywordsOptimized(chunkText, language)
|
|
}
|
|
|
|
// Sentiment (provider if available)
|
|
if a.sentimentProvider != nil {
|
|
if score, err := a.sentimentProvider.Score(chunkText, language); err == nil {
|
|
sentimentCh <- score
|
|
} else {
|
|
sentimentCh <- estimateSentimentOptimized(chunkText, language)
|
|
}
|
|
} else {
|
|
sentimentCh <- estimateSentimentOptimized(chunkText, language)
|
|
}
|
|
}(chunk)
|
|
}
|
|
|
|
// Wait for all goroutines to complete
|
|
wg.Wait()
|
|
close(wordCountCh)
|
|
close(sentenceCountCh)
|
|
close(paragraphCountCh)
|
|
close(wordLengthSumCh)
|
|
close(wordLengthCountCh)
|
|
close(keywordsCh)
|
|
close(sentimentCh)
|
|
|
|
// Aggregate results
|
|
result := &AnalysisResult{
|
|
PartOfSpeechCounts: make(map[string]int),
|
|
Entities: []Entity{},
|
|
Keywords: []Keyword{},
|
|
Topics: []Topic{},
|
|
}
|
|
|
|
// Sum up basic statistics
|
|
for wc := range wordCountCh {
|
|
result.WordCount += wc
|
|
}
|
|
for sc := range sentenceCountCh {
|
|
result.SentenceCount += sc
|
|
}
|
|
for pc := range paragraphCountCh {
|
|
result.ParagraphCount += pc
|
|
}
|
|
|
|
// Calculate average word length
|
|
var totalWordLengthSum float64
|
|
var totalWordCount int
|
|
for wls := range wordLengthSumCh {
|
|
totalWordLengthSum += wls
|
|
}
|
|
for wlc := range wordLengthCountCh {
|
|
totalWordCount += wlc
|
|
}
|
|
if totalWordCount > 0 {
|
|
result.AvgWordLength = totalWordLengthSum / float64(totalWordCount)
|
|
}
|
|
|
|
// Calculate sentence length average
|
|
if result.SentenceCount > 0 {
|
|
result.AvgSentenceLength = float64(result.WordCount) / float64(result.SentenceCount)
|
|
}
|
|
|
|
// Calculate readability score
|
|
result.ReadabilityScore = calculateReadabilityScore(result.AvgSentenceLength, result.AvgWordLength)
|
|
result.ReadabilityMethod = "Simplified Flesch-Kincaid"
|
|
|
|
// Merge keywords
|
|
keywordSum := make(map[string]float64)
|
|
keywordCount := make(map[string]int)
|
|
for kws := range keywordsCh {
|
|
for _, kw := range kws {
|
|
keywordSum[kw.Text] += kw.Relevance
|
|
keywordCount[kw.Text]++
|
|
}
|
|
}
|
|
for text, sum := range keywordSum {
|
|
cnt := keywordCount[text]
|
|
if cnt > 0 {
|
|
result.Keywords = append(result.Keywords, Keyword{Text: text, Relevance: sum / float64(cnt)})
|
|
}
|
|
}
|
|
|
|
// Average sentiment
|
|
var totalSentiment float64
|
|
var sentimentCount int
|
|
for s := range sentimentCh {
|
|
totalSentiment += s
|
|
sentimentCount++
|
|
}
|
|
if sentimentCount > 0 {
|
|
result.Sentiment = totalSentiment / float64(sentimentCount)
|
|
}
|
|
|
|
return result, nil
|
|
}
|