tercul-backend/internal/enrich/lemmatizer.go
Damir Mukimov 4957117cb6 Initial commit: Tercul Go project with comprehensive architecture
- Core Go application with GraphQL API using gqlgen
- Comprehensive data models for literary works, authors, translations
- Repository pattern with caching layer
- Authentication and authorization system
- Linguistics analysis capabilities with multiple adapters
- Vector search integration with Weaviate
- Docker containerization support
- Python data migration and analysis scripts
- Clean architecture with proper separation of concerns
- Production-ready configuration and middleware
- Proper .gitignore excluding vendor/, database files, and build artifacts
2025-08-13 07:42:32 +02:00

173 lines
4.4 KiB
Go

package enrich
import (
"strings"
)
// Lemmatizer reduces words to their base form (lemma).
//
// The type carries no fields, so it holds no per-instance state.
type Lemmatizer struct{}
// NewLemmatizer returns a pointer to a freshly allocated, ready-to-use
// Lemmatizer.
func NewLemmatizer() *Lemmatizer {
	return new(Lemmatizer)
}
// Lemma returns the base form (lemma) of word for the given language code.
//
// The word is lower-cased before any rule is applied. The language codes
// "es" and "fr" select the Spanish and French rules; every other value,
// including "en", is handled with the English rules. The returned error is
// always nil.
//
// This is a deliberately simple rule-based implementation; a dedicated NLP
// library or an external lemmatization service would give better results.
func (l *Lemmatizer) Lemma(word string, language string) (string, error) {
	normalized := strings.ToLower(word)

	var lemma string
	switch language {
	case "es":
		lemma = spanishLemma(normalized)
	case "fr":
		lemma = frenchLemma(normalized)
	default:
		// "en" and any unrecognised code share the English rules.
		lemma = englishLemma(normalized)
	}
	return lemma, nil
}
// englishLemma returns a best-effort base form of an English word.
//
// word is expected to be lower-cased already. Irregular verb forms are
// resolved from a fixed table; everything else goes through suffix rules
// modelled on the first steps of the Porter/Snowball English stemmers:
//
//   - words ending in "-eed" are left alone ("need", "speed");
//   - "-ies"/"-ied" become "-y" ("cities" -> "city", "cried" -> "cry"), or
//     "-ie" for four-letter words ("ties" -> "tie");
//   - a final "-s" is removed as a plural marker, "-es" after a sibilant
//     ending or "o" ("boxes" -> "box", "houses" -> "house"), except for
//     words shorter than four letters or ending in "-ss", "-us" or "-is"
//     ("glass", "status", "this");
//   - "-ing"/"-ed" are removed only when a vowel remains in the stem, and a
//     doubled final consonant other than l, s or z is undoubled
//     ("running" -> "run", "falling" -> "fall", "sing" -> "sing").
//
// Only ASCII suffix bytes are ever removed, so valid UTF-8 input stays
// valid. Known limitations: a dropped silent "e" is not restored
// ("making" -> "mak"), and nouns that merely look inflected are still
// trimmed ("morning" -> "morn").
func englishLemma(word string) string {
	if lemma, ok := englishIrregularLemma(word); ok {
		return lemma
	}

	switch {
	case strings.HasSuffix(word, "eed"):
		// The "ed" is part of the stem here, not a past-tense marker.
		return word
	case strings.HasSuffix(word, "ies"), strings.HasSuffix(word, "ied"):
		return englishYStem(word)
	case strings.HasSuffix(word, "s"):
		return englishSingular(word)
	case strings.HasSuffix(word, "ing"):
		return englishVerbStem(word[:len(word)-3], word)
	case strings.HasSuffix(word, "ed"):
		return englishVerbStem(word[:len(word)-2], word)
	}
	return word
}

// englishIrregularLemma looks word up in the fixed table of irregular verb
// forms. A switch is used so that no map is allocated on every call.
func englishIrregularLemma(word string) (string, bool) {
	switch word {
	case "am", "are", "is", "was", "were", "been":
		return "be", true
	case "have", "has", "had":
		return "have", true
	case "do", "does", "did", "done":
		return "do", true
	case "go", "goes", "went", "gone":
		return "go", true
	case "get", "gets", "got", "gotten":
		return "get", true
	case "make", "makes", "made":
		return "make", true
	case "say", "says", "said":
		return "say", true
	case "see", "sees", "saw", "seen":
		return "see", true
	case "come", "comes", "came":
		return "come", true
	case "take", "takes", "took", "taken":
		return "take", true
	case "know", "knows", "knew", "known":
		return "know", true
	case "think", "thinks", "thought":
		return "think", true
	}
	return "", false
}

// englishYStem rewrites a word ending in "-ies" or "-ied" to its "-y" form,
// or to "-ie" when the word is only four letters long.
func englishYStem(word string) string {
	switch {
	case len(word) > 4:
		return word[:len(word)-3] + "y"
	case len(word) == 4:
		// "ties" -> "tie", "died" -> "die"; "-y" would give "ty", "dy".
		return word[:3]
	}
	return word
}

// englishSingular removes the plural marker from a word ending in "s".
func englishSingular(word string) string {
	switch {
	case len(word) < 4,
		strings.HasSuffix(word, "ss"),
		strings.HasSuffix(word, "us"),
		strings.HasSuffix(word, "is"):
		// Too short to be a safe plural, or an ending that is almost never
		// one: "yes", "glass", "status", "this".
		return word
	case strings.HasSuffix(word, "sses"),
		strings.HasSuffix(word, "zzes"),
		strings.HasSuffix(word, "xes"),
		strings.HasSuffix(word, "ches"),
		strings.HasSuffix(word, "shes"),
		strings.HasSuffix(word, "oes"):
		return word[:len(word)-2]
	}
	return word[:len(word)-1]
}

// englishVerbStem completes the removal of an "-ing"/"-ed" suffix. stem is
// word without the suffix; word is returned untouched when the stem has no
// vowel, because the ending is then part of the word itself ("sing", "red").
func englishVerbStem(stem, word string) string {
	if !strings.ContainsAny(stem, "aeiouy") {
		return word
	}

	n := len(stem)
	if n >= 2 && stem[n-1] == stem[n-2] && isEnglishConsonant(stem[n-1]) {
		switch stem[n-1] {
		case 'l', 's', 'z':
			// These are usually doubled in the base form too: "fall",
			// "miss", "buzz".
		default:
			return stem[:n-1]
		}
	}
	return stem
}

// isEnglishConsonant reports whether b is a lower-case ASCII letter other
// than a, e, i, o or u. Bytes belonging to multi-byte runes never match.
func isEnglishConsonant(b byte) bool {
	switch b {
	case 'a', 'e', 'i', 'o', 'u':
		return false
	}
	return 'a' <= b && b <= 'z'
}
// spanishLemma returns a best-effort base form of a Spanish word.
//
// word is expected to be lower-cased already. The rules are intentionally
// simple:
//
//   - words ending in "-ar", "-er" or "-ir" are treated as infinitives and
//     returned unchanged;
//   - the gerund endings "-ando"/"-endo" and the participle endings
//     "-ado"/"-ido" are removed, but only when a vowel remains in the stem,
//     so that words such as "ido", "ando" or "lado" are not reduced to an
//     empty or vowel-less fragment;
//   - otherwise a plural "-es" or "-s" is removed.
//
// The verb rules yield a stem ("hablando" -> "habl"), not the infinitive.
func spanishLemma(word string) string {
	const vowels = "aeiouáéíóúü"

	if strings.HasSuffix(word, "ar") || strings.HasSuffix(word, "er") || strings.HasSuffix(word, "ir") {
		return word
	}

	for _, suffix := range [...]string{"ando", "endo", "ado", "ido"} {
		if !strings.HasSuffix(word, suffix) {
			continue
		}
		stem := word[:len(word)-len(suffix)]
		if strings.ContainsAny(stem, vowels) {
			return stem
		}
		// Nothing usable would be left; keep the word as it is.
		return word
	}

	switch {
	case strings.HasSuffix(word, "es") && len(word) > 2:
		return word[:len(word)-2]
	case strings.HasSuffix(word, "s") && len(word) > 1:
		return word[:len(word)-1]
	}
	return word
}
// frenchLemma returns a best-effort base form of a French word.
//
// word is expected to be lower-cased already. The rules are intentionally
// simple:
//
//   - words ending in "-er", "-ir" or "-re" are treated as infinitives and
//     returned unchanged;
//   - the endings "-ant", "-ent", "-é" and "-i" are removed, but only when a
//     vowel remains in the stem, so that words such as "ent", "dent" or
//     "né" are not reduced to an empty or vowel-less fragment;
//   - otherwise a plural "-s" or "-x" is removed.
//
// Suffixes are removed by their full byte length. "é" occupies two bytes in
// UTF-8, so cutting a single byte would leave a truncated rune behind.
func frenchLemma(word string) string {
	const vowels = "aeiouyàâäéèêëîïôöùûüœæ"

	if strings.HasSuffix(word, "er") || strings.HasSuffix(word, "ir") || strings.HasSuffix(word, "re") {
		return word
	}

	for _, suffix := range [...]string{"ant", "ent", "é", "i"} {
		if !strings.HasSuffix(word, suffix) {
			continue
		}
		stem := word[:len(word)-len(suffix)]
		if strings.ContainsAny(stem, vowels) {
			return stem
		}
		// Nothing usable would be left; keep the word as it is.
		return word
	}

	if len(word) > 1 && (strings.HasSuffix(word, "s") || strings.HasSuffix(word, "x")) {
		return word[:len(word)-1]
	}
	return word
}