tercul-backend/internal/enrich/tokenizer_test.go

package enrich

import (
	"strings"
	"testing"
	"unicode"
)

// TestTokenizer_Basic verifies that punctuation is dropped and that each
// token reports the expected text, zero-based position, and length.
func TestTokenizer_Basic(t *testing.T) {
	text := Text{Body: "Hello, world! Go1 is great."}
	tok := NewTokenizer()

	tokens, err := tok.Tokenize(text)
	if err != nil {
		t.Fatalf("Tokenize returned error: %v", err)
	}

	expected := []string{"Hello", "world", "Go1", "is", "great"}
	if len(tokens) != len(expected) {
		t.Fatalf("expected %d tokens, got %d: %#v", len(expected), len(tokens), tokens)
	}
	for i, e := range expected {
		if tokens[i].Text != e {
			t.Errorf("token %d text: expected %q, got %q", i, e, tokens[i].Text)
		}
		if tokens[i].Position != i {
			t.Errorf("token %d position: expected %d, got %d", i, i, tokens[i].Position)
		}
		// len(e) counts bytes; the expected tokens are all ASCII, so the
		// byte count and rune count coincide here.
		if tokens[i].Length != len(e) {
			t.Errorf("token %d length: expected %d, got %d", i, len(e), tokens[i].Length)
		}
	}
}
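
// The assertions in TestTokenizer_Basic imply a splitting rule: tokens are
// maximal runs of letters and digits ("Go1" stays intact), and everything
// else, including punctuation and whitespace, acts as a separator. The helper
// below is a hypothetical sketch of that rule for documentation only; it is
// not the package's implementation and is not called by the tests.
func referenceSplit(body string) []string {
	// FieldsFunc splits around runes for which the callback returns true,
	// so any rune that is neither a letter nor a digit is a separator.
	return strings.FieldsFunc(body, func(r rune) bool {
		return !unicode.IsLetter(r) && !unicode.IsDigit(r)
	})
}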

// TestTokenizer_UnicodeAndPunctuation verifies that Cyrillic words survive
// tokenization and that punctuation, including the standalone dash, is
// dropped. Only token text is asserted: len() on a Go string counts bytes,
// so byte-based Length expectations would be fragile for multi-byte
// Cyrillic text.
func TestTokenizer_UnicodeAndPunctuation(t *testing.T) {
	text := Text{Body: "Привет, мир! — hello?"}
	tok := NewTokenizer()

	tokens, err := tok.Tokenize(text)
	if err != nil {
		t.Fatalf("Tokenize returned error: %v", err)
	}

	expected := []string{"Привет", "мир", "hello"}
	if len(tokens) != len(expected) {
		t.Fatalf("expected %d tokens, got %d: %#v", len(expected), len(tokens), tokens)
	}
	for i, e := range expected {
		if tokens[i].Text != e {
			t.Errorf("token %d text: expected %q, got %q", i, e, tokens[i].Text)
		}
	}
}

// TestTokenizer_Empty verifies that whitespace-only input yields no tokens.
func TestTokenizer_Empty(t *testing.T) {
	tok := NewTokenizer()

	tokens, err := tok.Tokenize(Text{Body: " \t\n "})
	if err != nil {
		t.Fatalf("Tokenize returned error: %v", err)
	}
	if len(tokens) != 0 {
		t.Fatalf("expected 0 tokens for whitespace-only input, got %d", len(tokens))
	}
}
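
// TestTokenizer_TableDriven is a sketch of a table-driven variant of the
// cases above. It assumes only the Text/Tokenize API already exercised in
// this file and is illustrative rather than part of the original suite.
func TestTokenizer_TableDriven(t *testing.T) {
	cases := []struct {
		name string
		body string
		want []string
	}{
		{"basic", "Hello, world!", []string{"Hello", "world"}},
		{"unicode", "Привет, мир!", []string{"Привет", "мир"}},
		{"whitespace only", " \t\n ", nil},
	}
	tok := NewTokenizer()
	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			tokens, err := tok.Tokenize(Text{Body: tc.body})
			if err != nil {
				t.Fatalf("Tokenize returned error: %v", err)
			}
			if len(tokens) != len(tc.want) {
				t.Fatalf("expected %d tokens, got %d: %#v", len(tc.want), len(tokens), tokens)
			}
			for i, w := range tc.want {
				if tokens[i].Text != w {
					t.Errorf("token %d: expected %q, got %q", i, w, tokens[i].Text)
				}
			}
		})
	}
}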