tercul-backend/internal/enrich/tokenizer.go
Damir Mukimov 4957117cb6 Initial commit: Tercul Go project with comprehensive architecture
- Core Go application with GraphQL API using gqlgen
- Comprehensive data models for literary works, authors, translations
- Repository pattern with caching layer
- Authentication and authorization system
- Linguistics analysis capabilities with multiple adapters
- Vector search integration with Weaviate
- Docker containerization support
- Python data migration and analysis scripts
- Clean architecture with proper separation of concerns
- Production-ready configuration and middleware
- Proper .gitignore excluding vendor/, database files, and build artifacts
2025-08-13 07:42:32 +02:00

package enrich

import (
	"strings"
	"unicode"
)

// Tokenizer splits text into tokens.
type Tokenizer struct{}

// NewTokenizer creates a new Tokenizer.
func NewTokenizer() *Tokenizer {
	return &Tokenizer{}
}
// Tokenize splits the body of text into word tokens, recording each token's
// position, byte offset, and byte length in the original text.
func (t *Tokenizer) Tokenize(text Text) ([]Token, error) {
	// This is a simplified implementation. In a real-world scenario you would
	// use a library such as github.com/jdkato/prose or call an external API
	// for tokenization.
	content := text.Body

	// Reports whether r is neither a letter nor a number, i.e. punctuation or
	// symbols to trim from word boundaries.
	notAlnum := func(r rune) bool {
		return !unicode.IsLetter(r) && !unicode.IsNumber(r)
	}

	var tokens []Token

	// Split on whitespace first. strings.Fields never yields empty strings,
	// so no empty-word check is needed.
	words := strings.Fields(content)

	offset := 0
	for position, word := range words {
		// Find the byte offset of this word in the original text, searching
		// from the end of the previous word so repeated words are not matched
		// at an earlier occurrence.
		wordOffset := strings.Index(content[offset:], word) + offset

		// Advance the search offset past this raw word whether or not it
		// produces a token.
		offset = wordOffset + len(word)

		// Trim punctuation from the beginning and end of the word.
		cleanWord := strings.TrimFunc(word, notAlnum)

		// Skip words that are empty after cleaning (pure punctuation).
		if len(cleanWord) == 0 {
			continue
		}

		// Account for any leading punctuation that was trimmed so Offset
		// points at the first byte of the cleaned token text.
		leading := len(word) - len(strings.TrimLeftFunc(word, notAlnum))

		tokens = append(tokens, Token{
			Text:     cleanWord,
			Position: position,
			Offset:   wordOffset + leading,
			Length:   len(cleanWord),
		})
	}

	return tokens, nil
}
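
// Usage sketch (illustrative only): Text and Token are defined elsewhere in
// the enrich package, so the shapes below are assumptions made for this
// example, not the package's actual declarations.
//
//	type Text struct {
//		Body string
//	}
//
//	type Token struct {
//		Text     string // cleaned word
//		Position int    // index of the word in the whitespace-split input
//		Offset   int    // byte offset of the token text in the original body
//		Length   int    // byte length of the token text
//	}
//
// With those assumptions, tokenizing a sentence looks like:
//
//	tokenizer := NewTokenizer()
//	tokens, err := tokenizer.Tokenize(Text{Body: "Hello, world! 42 times."})
//	if err != nil {
//		// handle error (the current implementation never returns one)
//	}
//	for _, tok := range tokens {
//		fmt.Printf("%q word #%d at byte %d\n", tok.Text, tok.Position, tok.Offset)
//	}
//
// yielding the tokens "Hello", "world", "42", and "times" with surrounding
// punctuation stripped and offsets pointing into the original body.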