tercul-backend/internal/jobs/linguistics/lemmatizer.go

package linguistics

import (
	"strings"
)

// Lemmatizer reduces words to their base form (lemma).
//
// The type has no fields, so a value carries no configuration or state; all
// behaviour lives in its methods.
type Lemmatizer struct{}

// NewLemmatizer allocates a zero-value Lemmatizer and returns a pointer to it.
func NewLemmatizer() *Lemmatizer {
	return new(Lemmatizer)
}

// Lemma returns the base form (lemma) of word using the rules for language.
//
// The word is lower-cased before any rule is applied. The language is matched
// on its primary subtag, ignoring case and surrounding whitespace, so "fr",
// "FR", "fr-CA" and "fr_FR" all select the French rules. "es" selects the
// Spanish rules; "en", the empty string and every unrecognised value fall
// back to the English rules.
//
// This is a simplified, suffix-based implementation. A production system
// would use a dedicated NLP library or an external lemmatization service.
// The returned error is always nil in the current implementation.
func (l *Lemmatizer) Lemma(word string, language string) (string, error) {
	word = strings.ToLower(word)

	// Reduce a tag such as "fr-CA" or "es_MX" to its primary language subtag
	// so that regional variants do not silently fall through to English.
	lang := strings.ToLower(strings.TrimSpace(language))
	if i := strings.IndexAny(lang, "-_"); i >= 0 {
		lang = lang[:i]
	}

	switch lang {
	case "es":
		return spanishLemma(word), nil
	case "fr":
		return frenchLemma(word), nil
	default:
		// English is both an explicit choice ("en") and the fallback.
		return englishLemma(word), nil
	}
}

// englishIrregularVerbs maps inflected forms of common irregular English
// verbs (and the base forms themselves) to their lemma. It lives at package
// level so the table is built once instead of on every lookup, and it is
// only read after initialisation.
var englishIrregularVerbs = map[string]string{
	"am":      "be",
	"are":     "be",
	"is":      "be",
	"was":     "be",
	"were":    "be",
	"been":    "be",
	"have":    "have",
	"has":     "have",
	"had":     "have",
	"do":      "do",
	"does":    "do",
	"did":     "do",
	"done":    "do",
	"go":      "go",
	"goes":    "go",
	"went":    "go",
	"gone":    "go",
	"get":     "get",
	"gets":    "get",
	"got":     "get",
	"gotten":  "get",
	"make":    "make",
	"makes":   "make",
	"made":    "make",
	"say":     "say",
	"says":    "say",
	"said":    "say",
	"see":     "see",
	"sees":    "see",
	"saw":     "see",
	"seen":    "see",
	"come":    "come",
	"comes":   "come",
	"came":    "come",
	"take":    "take",
	"takes":   "take",
	"took":    "take",
	"taken":   "take",
	"know":    "know",
	"knows":   "know",
	"knew":    "know",
	"known":   "know",
	"think":   "think",
	"thinks":  "think",
	"thought": "think",
}

// englishLemma returns a heuristic base form of an English word.
//
// The word is expected to be lower-case already. Rules are tried in order:
//  1. an exact match in the irregular-verb table;
//  2. for words longer than two bytes that end in "s", the singular form
//     (see englishSingular);
//  3. for words ending in "ing" or "ed", the verb stem (see englishVerbStem).
//
// If no rule applies the word is returned unchanged.
func englishLemma(word string) string {
	if lemma, ok := englishIrregularVerbs[word]; ok {
		return lemma
	}

	// Every longer word ending in "s" is handled by the plural rules, even if
	// those rules decide to leave it untouched.
	if len(word) > 2 && strings.HasSuffix(word, "s") {
		return englishSingular(word)
	}

	switch {
	case len(word) > 3 && strings.HasSuffix(word, "ing"):
		return englishVerbStem(word, len("ing"))
	case len(word) > 2 && strings.HasSuffix(word, "ed"):
		return englishVerbStem(word, len("ed"))
	}

	return word
}

// englishSingular strips a plural (or third-person) "s" ending from word.
// The caller guarantees that word ends in "s" and is longer than two bytes.
func englishSingular(word string) string {
	// "class", "bus", "status": a final "ss" or "us" is part of the stem,
	// not an inflection.
	if strings.HasSuffix(word, "ss") || strings.HasSuffix(word, "us") {
		return word
	}

	if len(word) > 3 && strings.HasSuffix(word, "ies") {
		// With a single byte before "ies" ("ties", "dies") the base form
		// ends in "ie"; longer stems take "y" ("cities" -> "city").
		if len(word) == 4 {
			return word[:3]
		}
		return word[:len(word)-3] + "y"
	}

	// "es" is only a plural marker after a sibilant ("boxes", "classes",
	// "watches"). Elsewhere the "e" belongs to the stem ("names" -> "name").
	for _, suffix := range [...]string{"sses", "zzes", "xes", "ches", "shes"} {
		if strings.HasSuffix(word, suffix) {
			return word[:len(word)-2]
		}
	}

	return word[:len(word)-1]
}

// englishVerbStem removes a trailing verb suffix of suffixLen bytes ("ing" or
// "ed") from word. The caller guarantees the suffix is present and that at
// least one byte precedes it.
func englishVerbStem(word string, suffixLen int) string {
	stem := word[:len(word)-suffixLen]

	// A remainder without a vowel ("s" from "sing", "r" from "red") means the
	// ending was part of the word itself, not an inflection. Only ASCII
	// vowels are recognised.
	if !strings.ContainsAny(stem, "aeiouy") {
		return word
	}

	// Undo consonant doubling ("runn" -> "run", "stopp" -> "stop"). Doubled
	// vowels ("see") and doubled l, s or z ("fall", "pass", "buzz") normally
	// belong to the stem and are kept. Restricting the check to ASCII letters
	// also keeps the slice on a UTF-8 rune boundary.
	if n := len(stem); n >= 2 && stem[n-1] == stem[n-2] {
		last := stem[n-1]
		if last >= 'a' && last <= 'z' && strings.IndexByte("aeioulsz", last) < 0 {
			return stem[:n-1]
		}
	}

	return stem
}

// spanishLemma returns a heuristic base form of a Spanish word.
//
// This is a simplified suffix-stripping implementation; a real lemmatizer
// would need a dictionary. The rules are tried in order and the first match
// wins:
//   - words ending in "ar", "er" or "ir" are treated as infinitives and
//     returned unchanged;
//   - the gerund endings "ando"/"endo" and the participle endings "ado"/"ido"
//     are removed;
//   - the plural endings "es" and "s" are removed.
//
// A suffix is only removed when at least one byte would remain, so a word
// that consists solely of a suffix (for example "ando" or "ido") is returned
// unchanged instead of collapsing to the empty string.
func spanishLemma(word string) string {
	switch {
	case strings.HasSuffix(word, "ar"),
		strings.HasSuffix(word, "er"),
		strings.HasSuffix(word, "ir"):
		// Already in infinitive form.
		return word
	case len(word) > 4 && (strings.HasSuffix(word, "ando") || strings.HasSuffix(word, "endo")):
		return word[:len(word)-4]
	case len(word) > 3 && (strings.HasSuffix(word, "ado") || strings.HasSuffix(word, "ido")):
		return word[:len(word)-3]
	case len(word) > 2 && strings.HasSuffix(word, "es"):
		return word[:len(word)-2]
	case len(word) > 1 && strings.HasSuffix(word, "s"):
		return word[:len(word)-1]
	}

	// No rule applies.
	return word
}

// frenchLemma returns a heuristic base form of a French word.
//
// This is a simplified suffix-stripping implementation; a real lemmatizer
// would need a dictionary. The rules are tried in order and the first match
// wins:
//   - words ending in "er", "ir" or "re" are treated as infinitives and
//     returned unchanged;
//   - the endings "ant"/"ent" and the participle endings "é"/"i" are removed;
//   - the plural endings "s" and "x" are removed.
//
// A suffix is only removed when at least one byte would remain, so a word
// that consists solely of a suffix is returned unchanged instead of
// collapsing to the empty string.
func frenchLemma(word string) string {
	const eAcute = "é"

	switch {
	case strings.HasSuffix(word, "er"),
		strings.HasSuffix(word, "ir"),
		strings.HasSuffix(word, "re"):
		// Already in infinitive form.
		return word
	case len(word) > 3 && (strings.HasSuffix(word, "ant") || strings.HasSuffix(word, "ent")):
		return word[:len(word)-3]
	case len(word) > len(eAcute) && strings.HasSuffix(word, eAcute):
		// "é" occupies two bytes in UTF-8, so the whole suffix must be
		// removed; cutting a single byte would leave invalid UTF-8 behind.
		return strings.TrimSuffix(word, eAcute)
	case len(word) > 1 && strings.HasSuffix(word, "i"):
		return word[:len(word)-1]
	case len(word) > 1 && (strings.HasSuffix(word, "s") || strings.HasSuffix(word, "x")):
		return word[:len(word)-1]
	}

	// No rule applies.
	return word
}