tercul-backend/internal/enrich/tokenizer.go
Damir Mukimov 4957117cb6 Initial commit: Tercul Go project with comprehensive architecture
- Core Go application with GraphQL API using gqlgen
- Comprehensive data models for literary works, authors, translations
- Repository pattern with caching layer
- Authentication and authorization system
- Linguistics analysis capabilities with multiple adapters
- Vector search integration with Weaviate
- Docker containerization support
- Python data migration and analysis scripts
- Clean architecture with proper separation of concerns
- Production-ready configuration and middleware
- Proper .gitignore excluding vendor/, database files, and build artifacts
2025-08-13 07:42:32 +02:00

package enrich

import (
	"strings"
	"unicode"
)

// Tokenizer splits text into tokens.
type Tokenizer struct{}

// NewTokenizer creates a new Tokenizer.
func NewTokenizer() *Tokenizer {
	return &Tokenizer{}
}
// Tokenize splits the body of text into word tokens, recording each token's
// position, byte offset, and byte length in the original text.
func (t *Tokenizer) Tokenize(text Text) ([]Token, error) {
	// This is a simplified implementation. In a real-world scenario you would
	// use a library such as github.com/jdkato/prose or call an external API
	// for tokenization.
	content := text.Body

	// Reports whether r is neither a letter nor a number, i.e. punctuation or
	// symbols to trim from word boundaries.
	notAlnum := func(r rune) bool {
		return !unicode.IsLetter(r) && !unicode.IsNumber(r)
	}

	var tokens []Token

	// Split on whitespace first. strings.Fields never yields empty strings,
	// so no empty-word check is needed.
	words := strings.Fields(content)

	offset := 0
	for position, word := range words {
		// Find the byte offset of this word in the original text, searching
		// from the end of the previous word so repeated words are not matched
		// at an earlier occurrence.
		wordOffset := strings.Index(content[offset:], word) + offset

		// Advance the search offset past this raw word whether or not it
		// produces a token.
		offset = wordOffset + len(word)

		// Trim punctuation from the beginning and end of the word.
		cleanWord := strings.TrimFunc(word, notAlnum)

		// Skip words that are empty after cleaning (pure punctuation).
		if len(cleanWord) == 0 {
			continue
		}

		// Account for any leading punctuation that was trimmed so Offset
		// points at the first byte of the cleaned token text.
		leading := len(word) - len(strings.TrimLeftFunc(word, notAlnum))

		tokens = append(tokens, Token{
			Text:     cleanWord,
			Position: position,
			Offset:   wordOffset + leading,
			Length:   len(cleanWord),
		})
	}

	return tokens, nil
}
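
// Usage sketch (illustrative only): Text and Token are defined elsewhere in
// the enrich package, so the shapes below are assumptions made for this
// example, not the package's actual declarations.
//
//	type Text struct {
//		Body string
//	}
//
//	type Token struct {
//		Text     string // cleaned word
//		Position int    // index of the word in the whitespace-split input
//		Offset   int    // byte offset of the token text in the original body
//		Length   int    // byte length of the token text
//	}
//
// With those assumptions, tokenizing a sentence looks like:
//
//	tokenizer := NewTokenizer()
//	tokens, err := tokenizer.Tokenize(Text{Body: "Hello, world! 42 times."})
//	if err != nil {
//		// handle error (the current implementation never returns one)
//	}
//	for _, tok := range tokens {
//		fmt.Printf("%q word #%d at byte %d\n", tok.Text, tok.Position, tok.Offset)
//	}
//
// yielding the tokens "Hello", "world", "42", and "times" with surrounding
// punctuation stripped and offsets pointing into the original body.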