tercul-backend/internal/jobs/linguistics/lemmatizer.go
Damir Mukimov d50722dad5
Some checks failed
Test / Integration Tests (push) Successful in 4s
Build / Build Binary (push) Failing after 2m9s
Docker Build / Build Docker Image (push) Failing after 2m32s
Test / Unit Tests (push) Failing after 3m12s
Lint / Go Lint (push) Failing after 1m0s
Refactor ID handling to use UUIDs across the application
- Updated database models and repositories to replace uint IDs with UUIDs.
- Modified test fixtures to generate and use UUIDs for authors, translations, users, and works.
- Adjusted mock implementations to align with the new UUID structure.
- Ensured all relevant functions and methods are updated to handle UUIDs correctly.
- Added necessary imports for UUID handling in various files.
2025-12-27 00:33:34 +01:00

175 lines
4.5 KiB
Go

package linguistics
import (
"strings"
)
// Lemmatizer reduces words to their base form (lemma).
// The type carries no fields.
type Lemmatizer struct{}

// NewLemmatizer returns a pointer to a freshly allocated Lemmatizer.
func NewLemmatizer() *Lemmatizer {
	return new(Lemmatizer)
}
// Lemma returns the base form (lemma) of word using rules for language.
//
// The word is lowercased before any rule is applied. The language code is
// trimmed, lowercased and cut at the first '-' or '_' so that only its primary
// subtag is compared: "es", "ES", "es-MX" and "es_ES" all select the Spanish
// rules, and "fr" and its variants select the French rules. Every other value,
// including "en" and the empty string, falls back to the English rules.
//
// This is a simplified, rule-based implementation. A production system would
// use a dedicated NLP library (for example github.com/jdkato/prose) or an
// external lemmatization service.
//
// The returned error is currently always nil.
func (l *Lemmatizer) Lemma(word string, language string) (string, error) {
	word = strings.ToLower(word)

	// Language tags are case-insensitive and may carry a region or script
	// suffix; only the primary subtag decides which rule set applies.
	lang := strings.ToLower(strings.TrimSpace(language))
	if i := strings.IndexAny(lang, "-_"); i >= 0 {
		lang = lang[:i]
	}

	switch lang {
	case "es":
		return spanishLemma(word), nil
	case "fr":
		return frenchLemma(word), nil
	default:
		// "en" and any unrecognized language use the English rules.
		return englishLemma(word), nil
	}
}
// irregularEnglishLemmas maps irregular English verb forms to their lemma.
// It is built once at package initialization and must be treated as read-only.
var irregularEnglishLemmas = map[string]string{
	"am":      "be",
	"are":     "be",
	"is":      "be",
	"was":     "be",
	"were":    "be",
	"been":    "be",
	"have":    "have",
	"has":     "have",
	"had":     "have",
	"do":      "do",
	"does":    "do",
	"did":     "do",
	"done":    "do",
	"go":      "go",
	"goes":    "go",
	"went":    "go",
	"gone":    "go",
	"get":     "get",
	"gets":    "get",
	"got":     "get",
	"gotten":  "get",
	"make":    "make",
	"makes":   "make",
	"made":    "make",
	"say":     "say",
	"says":    "say",
	"said":    "say",
	"see":     "see",
	"sees":    "see",
	"saw":     "see",
	"seen":    "see",
	"come":    "come",
	"comes":   "come",
	"came":    "come",
	"take":    "take",
	"takes":   "take",
	"took":    "take",
	"taken":   "take",
	"know":    "know",
	"knows":   "know",
	"knew":    "know",
	"known":   "know",
	"think":   "think",
	"thinks":  "think",
	"thought": "think",
}

// englishLemma returns a heuristic base form of an English word. The word is
// expected to be lowercase already.
//
// Rules are tried in this order and the first match wins:
//  1. irregular verb forms are looked up in irregularEnglishLemmas;
//  2. words longer than two bytes ending in "s" are treated as plurals;
//  3. words ending in "ing" or "ed" have the suffix removed.
//
// A word that matches no rule is returned unchanged.
func englishLemma(word string) string {
	if lemma, ok := irregularEnglishLemmas[word]; ok {
		return lemma
	}

	switch {
	case len(word) > 2 && strings.HasSuffix(word, "s"):
		return englishSingular(word)
	case len(word) > 3 && strings.HasSuffix(word, "ing"):
		return stripEnglishVerbSuffix(word, "ing")
	case len(word) > 2 && strings.HasSuffix(word, "ed"):
		return stripEnglishVerbSuffix(word, "ed")
	}
	return word
}

// englishSingular removes a plural ending from word, which must end in "s".
func englishSingular(word string) string {
	switch {
	case strings.HasSuffix(word, "ss"):
		// "class", "glass": the final "s" belongs to the stem.
		return word
	case len(word) > 4 && strings.HasSuffix(word, "ies"):
		// "cities" -> "city". Four-letter words such as "ties" fall through
		// to the rules below and become "tie" rather than "ty".
		return word[:len(word)-3] + "y"
	case strings.HasSuffix(word, "es"):
		// "-es" is only a full plural ending after a sibilant or "o"
		// ("boxes", "glasses", "churches", "potatoes"). Elsewhere the "e"
		// belongs to the stem ("houses" -> "house").
		stem := word[:len(word)-2]
		for _, end := range []string{"ss", "zz", "x", "ch", "sh", "o"} {
			if strings.HasSuffix(stem, end) {
				return stem
			}
		}
	}
	return word[:len(word)-1]
}

// stripEnglishVerbSuffix removes suffix ("ing" or "ed") from word and undoes
// a doubled final consonant ("running" -> "run", "stopped" -> "stop").
// The word is returned unchanged when the remaining stem has no vowel, which
// keeps words such as "sing" and "bed" intact.
func stripEnglishVerbSuffix(word, suffix string) string {
	stem := word[:len(word)-len(suffix)]
	if !strings.ContainsAny(stem, "aeiouy") {
		return word
	}
	// Only a doubled consonant is collapsed; a doubled vowel is part of the
	// stem ("seeing" -> "see"). Restricting the check to ASCII letters also
	// avoids cutting through a multi-byte UTF-8 sequence.
	if n := len(stem); n >= 2 && stem[n-1] == stem[n-2] && isEnglishConsonant(stem[n-1]) {
		return stem[:n-1]
	}
	return stem
}

// isEnglishConsonant reports whether b is a lowercase ASCII letter other than
// a, e, i, o or u.
func isEnglishConsonant(b byte) bool {
	return b >= 'a' && b <= 'z' && strings.IndexByte("aeiou", b) < 0
}
// spanishLemma returns a heuristic base form of a Spanish word. The word is
// expected to be lowercase already.
//
// Rules are tried in this order and the first match wins:
//  1. words ending in "ar", "er" or "ir" are treated as infinitives and
//     returned unchanged;
//  2. gerund endings ("ando", "iendo", "endo") and participle endings
//     ("ado", "ido") are removed, provided the remaining stem still contains
//     a vowel;
//  3. plural endings "es" and "s" are removed.
//
// This is a simplified implementation: verb forms are reduced to a stem, not
// to their infinitive. A word that matches no rule is returned unchanged.
func spanishLemma(word string) string {
	const vowels = "aeiouáéíóúü"

	for _, infinitive := range []string{"ar", "er", "ir"} {
		if strings.HasSuffix(word, infinitive) {
			return word
		}
	}

	// "iendo" is listed before "endo" so that "comiendo" and "comido" reduce
	// to the same stem ("com") instead of leaving a dangling "i".
	for _, suffix := range []string{"ando", "iendo", "endo", "ado", "ido"} {
		if !strings.HasSuffix(word, suffix) {
			continue
		}
		stem := strings.TrimSuffix(word, suffix)
		if strings.ContainsAny(stem, vowels) {
			return stem
		}
		// The word is (almost) nothing but the suffix, e.g. "ido" or "lado";
		// stripping would yield an empty or vowel-less string.
		return word
	}

	switch {
	case len(word) > 2 && strings.HasSuffix(word, "es"):
		return word[:len(word)-2]
	case len(word) > 1 && strings.HasSuffix(word, "s"):
		return word[:len(word)-1]
	}
	return word
}
// frenchLemma returns a heuristic base form of a French word. The word is
// expected to be lowercase already.
//
// Rules are tried in this order and the first match wins:
//  1. words ending in "er", "ir" or "re" are treated as infinitives and
//     returned unchanged;
//  2. the verb endings "ant", "ent", "é" and "i" are removed, provided the
//     remaining stem still contains a vowel;
//  3. the plural endings "s" and "x" are removed.
//
// This is a simplified implementation: verb forms are reduced to a stem, not
// to their infinitive. A word that matches no rule is returned unchanged.
func frenchLemma(word string) string {
	const vowels = "aeiouyàâäéèêëîïôöùûü"

	for _, infinitive := range []string{"er", "ir", "re"} {
		if strings.HasSuffix(word, infinitive) {
			return word
		}
	}

	for _, suffix := range []string{"ant", "ent", "é", "i"} {
		if !strings.HasSuffix(word, suffix) {
			continue
		}
		// TrimSuffix removes the whole suffix. "é" occupies two bytes in
		// UTF-8, so slicing off a single byte would leave invalid UTF-8.
		stem := strings.TrimSuffix(word, suffix)
		if strings.ContainsAny(stem, vowels) {
			return stem
		}
		// Stripping would leave an empty or vowel-less string
		// (e.g. "ent", "dent"), so the word is kept as is.
		return word
	}

	if len(word) > 1 && (strings.HasSuffix(word, "s") || strings.HasSuffix(word, "x")) {
		return word[:len(word)-1]
	}
	return word
}