Mirror of https://github.com/SamyRai/tercul-backend.git, synced 2025-12-27 05:11:34 +00:00
- Core Go application with a GraphQL API using gqlgen
- Comprehensive data models for literary works, authors, and translations
- Repository pattern with a caching layer
- Authentication and authorization system
- Linguistic analysis capabilities with multiple adapters
- Vector search integration with Weaviate
- Docker containerization support
- Python data migration and analysis scripts
- Clean architecture with proper separation of concerns
- Production-ready configuration and middleware
- Proper .gitignore excluding vendor/, database files, and build artifacts
63 lines · 1.3 KiB · Go
package enrich

import (
	"strings"
	"unicode"
)

// Tokenizer splits text into tokens.
type Tokenizer struct{}

// NewTokenizer creates a new Tokenizer.
func NewTokenizer() *Tokenizer {
	return &Tokenizer{}
}

// Tokenize splits text into tokens and returns them.
func (t *Tokenizer) Tokenize(text Text) ([]Token, error) {
	// This is a simplified implementation. In a real-world scenario you
	// would use a library like github.com/jdkato/prose or call an
	// external API for tokenization.
	content := text.Body
	var tokens []Token

	// Split on whitespace first. strings.Fields never yields empty
	// strings, so no empty-word check is needed.
	words := strings.Fields(content)
	offset := 0

	for position, word := range words {
		// Find the offset of this word in the original text, scanning
		// forward from the end of the previous word so that repeated
		// words resolve to the correct occurrence.
		wordOffset := strings.Index(content[offset:], word) + offset

		// Clean the word by trimming punctuation from both ends.
		cleanWord := strings.TrimFunc(word, func(r rune) bool {
			return !unicode.IsLetter(r) && !unicode.IsNumber(r)
		})

		// Skip words that are empty after cleaning (pure punctuation).
		// Position tracks word indices in the text, not token indices,
		// so skipped words leave gaps in the sequence.
		if len(cleanWord) == 0 {
			offset = wordOffset + len(word)
			continue
		}

		// Point Offset at the cleaned token rather than at any leading
		// punctuation that was trimmed away, so that Offset+Length
		// spans exactly the token text.
		cleanOffset := wordOffset + strings.Index(word, cleanWord)

		tokens = append(tokens, Token{
			Text:     cleanWord,
			Position: position,
			Offset:   cleanOffset,
			Length:   len(cleanWord),
		})

		offset = wordOffset + len(word)
	}

	return tokens, nil
}
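The Text and Token types are defined elsewhere in the enrich package. As a quick check of the byte-offset bookkeeping, a test along these lines exercises the tokenizer; it assumes Text has a Body string field and Token has the Text, Position, Offset, and Length fields used above:

package enrich

import "testing"

// A minimal sketch of a test for Tokenize. It assumes Text and Token are
// defined in this package with the fields used by the tokenizer above.
func TestTokenize(t *testing.T) {
	tok := NewTokenizer()
	tokens, err := tok.Tokenize(Text{Body: "Hello, world!"})
	if err != nil {
		t.Fatal(err)
	}

	// "Hello, world!" splits into two words; trimming punctuation yields
	// "Hello" at offset 0 and "world" at offset 7.
	if len(tokens) != 2 {
		t.Fatalf("expected 2 tokens, got %d", len(tokens))
	}
	if tokens[1].Text != "world" || tokens[1].Offset != 7 || tokens[1].Length != 5 {
		t.Errorf("unexpected second token: %+v", tokens[1])
	}
}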
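The comment in Tokenize points to github.com/jdkato/prose as the production-grade alternative. A sketch of what that swap might look like, using prose/v2's documented NewDocument and Tokens calls; the mapping back to this package's Token type, and the helper name itself, are assumptions:

package enrich

import (
	prose "github.com/jdkato/prose/v2"
)

// tokenizeWithProse is a hypothetical alternative that delegates to the
// prose library instead of the hand-rolled whitespace splitter. Note that
// prose tokens do not carry byte offsets, so Offset is left unset here.
func tokenizeWithProse(text Text) ([]Token, error) {
	doc, err := prose.NewDocument(text.Body)
	if err != nil {
		return nil, err
	}

	var tokens []Token
	for position, tok := range doc.Tokens() {
		tokens = append(tokens, Token{
			Text:     tok.Text,
			Position: position,
			Length:   len(tok.Text),
		})
	}
	return tokens, nil
}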