package linguistics

import (
	"strings"
	"unicode"
)

// Tokenizer splits text into tokens.
type Tokenizer struct{}

// NewTokenizer creates a new Tokenizer.
func NewTokenizer() *Tokenizer {
	return &Tokenizer{}
}

// Tokenize splits text into tokens and returns them.
func (t *Tokenizer) Tokenize(text Text) ([]Token, error) {
	// This is a simplified implementation. In a real-world scenario, you
	// would use a library such as github.com/jdkato/prose or call an
	// external API for tokenization.
	content := text.Body
	var tokens []Token

	// A rune belongs to a token if it is a letter or a number; everything
	// else is treated as trimmable punctuation.
	isTokenRune := func(r rune) bool {
		return unicode.IsLetter(r) || unicode.IsNumber(r)
	}

	// Split by whitespace first. strings.Fields never returns empty
	// strings, so no empty-word check is needed.
	words := strings.Fields(content)

	offset := 0
	for position, word := range words {
		// Find the byte offset of this word in the original text. Searching
		// from the running offset keeps repeated words pointing at the
		// correct occurrence.
		wordOffset := strings.Index(content[offset:], word) + offset

		// Clean the word by removing punctuation at the beginning and end.
		cleanWord := strings.TrimFunc(word, func(r rune) bool {
			return !isTokenRune(r)
		})

		// Skip words that were pure punctuation.
		if len(cleanWord) == 0 {
			offset = wordOffset + len(word)
			continue
		}

		// The token's offset must account for any leading punctuation that
		// was trimmed off, so locate the first letter or number in the raw
		// word.
		lead := strings.IndexFunc(word, isTokenRune)

		// Create a token. Position is the index of the word among the
		// whitespace-separated fields; Offset and Length are in bytes,
		// consistent with Go string indexing.
		tokens = append(tokens, Token{
			Text:     cleanWord,
			Position: position,
			Offset:   wordOffset + lead,
			Length:   len(cleanWord),
		})

		offset = wordOffset + len(word)
	}

	return tokens, nil
}
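
// tokenizeExample is a small usage sketch, not part of the original API. It
// assumes the Text and Token types are defined elsewhere in this package with
// the fields used above (Body on Text; Text, Position, Offset, and Length on
// Token).
func tokenizeExample() ([]Token, error) {
	tok := NewTokenizer()
	// For this input, Tokenize yields two tokens:
	//   {Text: "Hello", Position: 0, Offset: 0, Length: 5}
	//   {Text: "world", Position: 1, Offset: 7, Length: 5}
	// The trailing punctuation is trimmed, and each Offset points at the
	// first letter of the cleaned word in the original string.
	return tok.Tokenize(Text{Body: "Hello, world!"})
}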