tercul-backend/internal/jobs/linguistics/lemmatizer.go
google-labs-jules[bot] 53aa4d0344
Security Hardening and GraphQL Caching (#69)
* feat: add security middleware, graphql apq, and improved linting

- Add RateLimit, RequestValidation, and CORS middleware.
- Configure middleware chain in API server.
- Implement Redis cache for GraphQL Automatic Persisted Queries.
- Add .golangci.yml and fix linting issues (shadowing, timeouts).

* feat: security, caching and linting config

- Fix .golangci.yml config for govet shadow check
- (Previous changes: Security middleware, GraphQL APQ, Linting fixes)

* fix: resolve remaining lint errors

- Fix unhandled errors in tests (errcheck)
- Define constants for repeated strings (goconst)
- Suppress high complexity warnings with nolint:gocyclo
- Fix integer overflow warnings (gosec)
- Add package comments
- Split long lines (lll)
- Rename Analyse -> Analyze (misspell)
- Fix naked returns and unused params

---------

Co-authored-by: google-labs-jules[bot] <161369871+google-labs-jules[bot]@users.noreply.github.com>
2025-12-01 00:14:22 +01:00

174 lines
4.5 KiB
Go

package linguistics
import (
"strings"
)
// Lemmatizer reduces words to their base form (lemma) using small,
// rule-based heuristics. It has no fields, so the zero value is ready to use.
type Lemmatizer struct{}

// NewLemmatizer returns a pointer to a fresh Lemmatizer.
func NewLemmatizer() *Lemmatizer {
	return new(Lemmatizer)
}
// Lemma lower-cases word and returns its lemma according to the rule set
// selected by language: "es" uses the Spanish rules, "fr" the French rules,
// and "en" or any other value the English rules.
//
// This is a simplified, rule-based implementation; a production system would
// use a dedicated NLP library or an external lemmatization service. The
// returned error is always nil in this implementation.
func (l *Lemmatizer) Lemma(word string, language string) (string, error) {
	normalized := strings.ToLower(word)

	var lemma string
	switch language {
	case "es":
		lemma = spanishLemma(normalized)
	case "fr":
		lemma = frenchLemma(normalized)
	default:
		// "en" and every unrecognized language code share the English rules.
		lemma = englishLemma(normalized)
	}
	return lemma, nil
}
// englishIrregularVerbs maps inflected forms of common irregular English
// verbs to their lemma. Base forms map to themselves so that the suffix rules
// never touch them. The map is built once and only read afterwards.
var englishIrregularVerbs = map[string]string{
	"am":      "be",
	"are":     "be",
	"is":      "be",
	"was":     "be",
	"were":    "be",
	"been":    "be",
	"have":    "have",
	"has":     "have",
	"had":     "have",
	"do":      "do",
	"does":    "do",
	"did":     "do",
	"done":    "do",
	"go":      "go",
	"goes":    "go",
	"went":    "go",
	"gone":    "go",
	"get":     "get",
	"gets":    "get",
	"got":     "get",
	"gotten":  "get",
	"make":    "make",
	"makes":   "make",
	"made":    "make",
	"say":     "say",
	"says":    "say",
	"said":    "say",
	"see":     "see",
	"sees":    "see",
	"saw":     "see",
	"seen":    "see",
	"come":    "come",
	"comes":   "come",
	"came":    "come",
	"take":    "take",
	"takes":   "take",
	"took":    "take",
	"taken":   "take",
	"know":    "know",
	"knows":   "know",
	"knew":    "know",
	"known":   "know",
	"think":   "think",
	"thinks":  "think",
	"thought": "think",
}

// englishLemma returns a heuristic base form for an English word. The word is
// expected to be lower-cased already (Lemma does this before calling).
//
// Rules are tried in order and the first match wins:
//  1. irregular verb table lookup;
//  2. words longer than two bytes ending in "s": "-ss"/"-us"/"-is" are left
//     alone, "-ies" becomes "-y", "-es" is removed after a sibilant ending
//     ("boxes" -> "box"), otherwise only the final "s" is removed;
//  3. "-ing" and "-ed" are removed, collapsing a doubled final consonant
//     ("running" -> "run").
//
// Words matching no rule are returned unchanged. These are heuristics, not a
// dictionary: forms such as "houses" or "added" are still reduced imperfectly.
//
//nolint:gocyclo // Large switch case
func englishLemma(word string) string {
	if lemma, ok := englishIrregularVerbs[word]; ok {
		return lemma
	}

	// Plural nouns / third-person verbs. Every word entering this branch
	// returns from it, so "-s" words never reach the verb rules below.
	if strings.HasSuffix(word, "s") && len(word) > 2 {
		switch {
		case strings.HasSuffix(word, "ss"),
			strings.HasSuffix(word, "us"),
			strings.HasSuffix(word, "is"):
			// "class", "status", "analysis": the final "s" is part of the stem.
			return word
		case strings.HasSuffix(word, "ies") && len(word) > 4:
			// "cities" -> "city"; four-letter words such as "ties" fall
			// through and only lose the "s".
			return word[:len(word)-3] + "y"
		case strings.HasSuffix(word, "sses"),
			strings.HasSuffix(word, "xes"),
			strings.HasSuffix(word, "ches"),
			strings.HasSuffix(word, "shes"),
			strings.HasSuffix(word, "zzes"):
			// The "e" was inserted only to make the plural pronounceable.
			return word[:len(word)-2]
		default:
			// "names" -> "name", "cats" -> "cat".
			return word[:len(word)-1]
		}
	}

	// Verb forms.
	switch {
	case strings.HasSuffix(word, "ing") && len(word) > 3:
		return undoubleFinalConsonant(word[:len(word)-3])
	case strings.HasSuffix(word, "ed") && len(word) > 2:
		return undoubleFinalConsonant(word[:len(word)-2])
	}

	// No rule applied.
	return word
}

// undoubleFinalConsonant drops the last byte of stem when stem ends in a
// doubled consonant of the kind English adds before "-ing"/"-ed"
// ("runn" -> "run", "stopp" -> "stop"). Doubled vowels ("see"), "ll", "ss",
// "ff", "zz" and non-ASCII bytes are left untouched, so a multi-byte rune is
// never split.
func undoubleFinalConsonant(stem string) string {
	n := len(stem)
	if n < 2 || stem[n-1] != stem[n-2] {
		return stem
	}
	if strings.IndexByte("bdgmnprt", stem[n-1]) < 0 {
		return stem
	}
	return stem[:n-1]
}
// spanishLemma returns a heuristic base form for a Spanish word. The word is
// expected to be lower-cased already (Lemma does this before calling).
//
// Rules are tried in order and the first match wins:
//  1. infinitives ("-ar", "-er", "-ir") are returned unchanged;
//  2. gerund endings "-ando"/"-endo" are removed;
//  3. participle endings "-ado"/"-ido" are removed;
//  4. plural "-es" is removed, otherwise a plural "-s" is removed.
//
// A suffix is only removed when at least one byte of stem remains, so a word
// that consists solely of a suffix (for example "ido") is returned as is
// rather than as an empty string. Words matching no rule are returned
// unchanged. This is a simplified implementation: rules 2 and 3 yield the
// stem, not the infinitive.
func spanishLemma(word string) string {
	n := len(word)
	switch {
	case strings.HasSuffix(word, "ar"),
		strings.HasSuffix(word, "er"),
		strings.HasSuffix(word, "ir"):
		return word
	case n > 4 && (strings.HasSuffix(word, "ando") || strings.HasSuffix(word, "endo")):
		return word[:n-4]
	case n > 3 && (strings.HasSuffix(word, "ado") || strings.HasSuffix(word, "ido")):
		return word[:n-3]
	case n > 2 && strings.HasSuffix(word, "es"):
		return word[:n-2]
	case n > 1 && strings.HasSuffix(word, "s"):
		return word[:n-1]
	}

	// No rule applied.
	return word
}
// frenchLemma returns a heuristic base form for a French word. The word is
// expected to be lower-cased already (Lemma does this before calling).
//
// Rules are tried in order and the first match wins:
//  1. infinitives ("-er", "-ir", "-re") are returned unchanged;
//  2. "-ant"/"-ent" endings are removed;
//  3. participle endings "-é" and "-i" are removed;
//  4. plural "-s" or "-x" is removed.
//
// A suffix is only removed when at least one byte of stem remains, so the
// result is never empty for non-empty input. Words matching no rule are
// returned unchanged.
//
// NOTE(review): the "é" rule matches the precomposed character (U+00E9) only;
// input in decomposed form ("e" + combining accent) is presumably not expected
// here — confirm against callers.
func frenchLemma(word string) string {
	n := len(word)
	switch {
	case strings.HasSuffix(word, "er"),
		strings.HasSuffix(word, "ir"),
		strings.HasSuffix(word, "re"):
		return word
	case n > 3 && (strings.HasSuffix(word, "ant") || strings.HasSuffix(word, "ent")):
		return word[:n-3]
	case n > len("é") && strings.HasSuffix(word, "é"):
		// "é" occupies two bytes in UTF-8; TrimSuffix removes the whole rune
		// where slicing off a single byte would leave invalid UTF-8.
		return strings.TrimSuffix(word, "é")
	case n > 1 && strings.HasSuffix(word, "i"):
		return word[:n-1]
	case n > 1 && (strings.HasSuffix(word, "s") || strings.HasSuffix(word, "x")):
		return word[:n-1]
	}

	// No rule applied.
	return word
}