tercul-backend/internal/jobs/linguistics/analyzer.go
google-labs-jules[bot] 781b313bf1 feat: Complete all pending tasks from TASKS.md
This commit addresses all the high-priority tasks outlined in the TASKS.md file, significantly improving the application's observability, completing key features, and refactoring critical parts of the codebase.

### Observability

- **Centralized Logging:** Implemented a new structured, context-aware logging system using `zerolog`. A new logging middleware injects request-specific information (request ID, user ID, trace ID) into the logger, and all application logging has been refactored to use this new system.
- **Prometheus Metrics:** Added Prometheus metrics for database query performance by creating a GORM plugin that automatically records query latency and totals.
- **OpenTelemetry Tracing:** Fully instrumented all application services in `internal/app` and data repositories in `internal/data/sql` with OpenTelemetry tracing, providing deep visibility into application performance.

### Features

- **Analytics:** Implemented like, comment, and bookmark counting. The respective command handlers now call the analytics service to increment counters when these actions are performed.
- **Enrichment Tool:** Built a new, extensible `enrich` command-line tool to fetch data from external sources. The initial implementation enriches author data using the Open Library API.

### Refactoring & Fixes

- **Decoupled Testing:** Refactored the testing utilities in `internal/testutil` to be database-agnostic, promoting the use of mock-based unit tests and improving test speed and reliability.
- **Build Fixes:** Resolved numerous build errors, including a critical import cycle between the logging, observability, and authentication packages.
- **Search Service:** Fixed the search service integration by implementing the `GetWorkContent` method in the localization service, allowing the search indexer to correctly fetch and index work content.
2025-10-05 05:26:27 +00:00

173 lines
4.7 KiB
Go

package linguistics
import (
"context"
"crypto/sha256"
"encoding/hex"
"sync"
"tercul/internal/platform/cache"
"tercul/internal/platform/log"
)
// Analyzer defines the interface for linguistic analysis services
type Analyzer interface {
	// AnalyzeText performs linguistic analysis on the given text in the
	// given language and returns the structured result.
	AnalyzeText(ctx context.Context, text string, language string) (*AnalysisResult, error)
	// AnalyzeWork performs linguistic analysis on the work identified by
	// workID. Results are handled by the implementation rather than returned.
	AnalyzeWork(ctx context.Context, workID uint) error
}
// BasicAnalyzer implements the Analyzer interface as a thin coordination layer.
// It delegates pure text analysis to TextAnalyzer and work analysis to WorkAnalysisService,
// and only handles caching and orchestration concerns here to preserve SRP/DRY.
type BasicAnalyzer struct {
	textAnalyzer        TextAnalyzer        // pure per-text analysis engine
	workAnalysisService WorkAnalysisService // owns analysis of whole works
	cache               cache.Cache         // optional distributed cache; may be nil
	resultCache         map[string]*AnalysisResult // in-process cache; NOTE(review): unbounded — no eviction visible here
	cacheMutex          sync.RWMutex        // guards resultCache
	concurrency         int                 // worker count for concurrent analysis of long texts
	cacheEnabled        bool                // master switch for both cache layers
}
// NewBasicAnalyzer constructs a BasicAnalyzer wired with its text-analysis
// and work-analysis dependencies. A non-positive concurrency value falls
// back to a default of 4 workers.
func NewBasicAnalyzer(
	textAnalyzer TextAnalyzer,
	workService WorkAnalysisService,
	redis cache.Cache,
	concurrency int,
	cacheEnabled bool,
) *BasicAnalyzer {
	// Clamp invalid worker counts to a sane default.
	workers := concurrency
	if workers <= 0 {
		workers = 4
	}
	analyzer := &BasicAnalyzer{
		textAnalyzer:        textAnalyzer,
		workAnalysisService: workService,
		cache:               redis,
		resultCache:         make(map[string]*AnalysisResult),
		concurrency:         workers,
		cacheEnabled:        cacheEnabled,
	}
	return analyzer
}
// WithCache installs the given distributed cache on the analyzer and
// returns the analyzer itself so calls can be chained.
func (a *BasicAnalyzer) WithCache(cache cache.Cache) *BasicAnalyzer {
	a.cache = cache
	return a
}
// WithConcurrency overrides the number of concurrent workers. Values below
// one are ignored so the analyzer keeps its previous setting. Returns the
// analyzer for chaining.
func (a *BasicAnalyzer) WithConcurrency(concurrency int) *BasicAnalyzer {
	if concurrency >= 1 {
		a.concurrency = concurrency
	}
	return a
}
// EnableCache turns on caching of analysis results (both the in-memory map
// and, when configured, the distributed cache).
func (a *BasicAnalyzer) EnableCache() {
	a.cacheEnabled = true
}
// DisableCache turns off caching of analysis results; subsequent calls to
// AnalyzeText will always recompute.
func (a *BasicAnalyzer) DisableCache() {
	a.cacheEnabled = false
}
// AnalyzeText performs basic linguistic analysis on the given text.
//
// Lookup order when caching is enabled: the in-memory map first, then the
// distributed cache (if configured). On a miss the text is analyzed —
// concurrently for long inputs — and the result is written back to both
// cache layers. Cache write failures are logged, never returned.
//
// Fix: the cache key was previously computed twice (once on the lookup path
// and again on the store path), hashing the full text with SHA-256 both
// times; it is now computed once per call.
func (a *BasicAnalyzer) AnalyzeText(ctx context.Context, text string, language string) (*AnalysisResult, error) {
	logger := log.FromContext(ctx).With("language", language).With("textLength", len(text))

	// Compute the content-hash key once; it is needed on both the lookup
	// and the write-back paths below.
	var cacheKey string
	if a.cacheEnabled {
		cacheKey = makeTextCacheKey(language, text)

		// Fast path: in-memory cache.
		a.cacheMutex.RLock()
		cachedResult, found := a.resultCache[cacheKey]
		a.cacheMutex.RUnlock()
		if found {
			logger.Debug("In-memory cache hit for text analysis")
			return cachedResult, nil
		}

		// Second chance: distributed cache, if one is configured.
		if a.cache != nil {
			var cachedResult AnalysisResult
			err := a.cache.Get(ctx, "text_analysis:"+cacheKey, &cachedResult)
			if err == nil {
				logger.Debug("Redis cache hit for text analysis")
				// Promote to the in-memory cache for faster subsequent hits.
				a.cacheMutex.Lock()
				a.resultCache[cacheKey] = &cachedResult
				a.cacheMutex.Unlock()
				return &cachedResult, nil
			}
		}
	}

	// Cache miss or caching disabled: perform analysis using the pure TextAnalyzer.
	logger.Debug("Performing text analysis")
	var (
		result *AnalysisResult
		err    error
	)
	// Long texts are split across workers; short texts are analyzed inline.
	if len(text) > 10000 && a.concurrency > 1 {
		result, err = a.textAnalyzer.AnalyzeTextConcurrently(ctx, text, language, a.concurrency)
	} else {
		result, err = a.textAnalyzer.AnalyzeText(ctx, text, language)
	}
	if err != nil {
		return nil, err
	}

	// Write the fresh result back to both cache layers.
	if a.cacheEnabled {
		a.cacheMutex.Lock()
		a.resultCache[cacheKey] = result
		a.cacheMutex.Unlock()

		// Best-effort write-through: a failure here must not fail the call.
		if a.cache != nil {
			if err := a.cache.Set(ctx, "text_analysis:"+cacheKey, result, 0); err != nil {
				logger.Error(err, "Failed to cache text analysis result")
			}
		}
	}

	return result, nil
}
// AnalyzeWork performs linguistic analysis on a work and stores the results
func (a *BasicAnalyzer) AnalyzeWork(ctx context.Context, workID uint) error {
	// Delegate to the WorkAnalysisService to preserve single ownership;
	// this method exists so BasicAnalyzer satisfies the Analyzer interface.
	return a.workAnalysisService.AnalyzeWork(ctx, workID)
}
// Helper functions for text analysis

// min returns the smaller of two integers.
// NOTE(review): Go 1.21+ provides a built-in min; this helper is kept so
// existing callers in the package remain unaffected.
func min(a, b int) int {
	if b < a {
		return b
	}
	return a
}
// Note: max was unused and has been removed to keep the code minimal and focused
// makeTextCacheKey derives a stable, collision-resistant cache key for a
// piece of text in a given language. Hashing the content (rather than
// embedding it) keeps keys bounded in size and avoids leaking raw text
// into cache key space. The key has the form "<language>:<sha256-hex>".
func makeTextCacheKey(language, text string) string {
	digest := sha256.Sum256([]byte(text))
	return language + ":" + hex.EncodeToString(digest[:])
}