package linguistics

import "testing"
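
// The Text, Token, and Tokenizer identifiers exercised below are defined
// elsewhere in package linguistics. Judging only from the call sites and
// assertions in these tests, the assumed surface is roughly:
//
//	type Text struct{ Body string }
//
//	type Token struct {
//		Text     string // token text with punctuation stripped
//		Position int    // ordinal index of the token in the sequence
//		Length   int    // token length; the ASCII cases here cannot
//		                // distinguish a byte count from a rune count
//	}
//
//	func NewTokenizer() *Tokenizer
//	func (t *Tokenizer) Tokenize(text Text) ([]Token, error)
//
// This sketch is inferred from the tests, not authoritative; consult the
// package source for the actual definitions.
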
// TestTokenizer_Basic tokenizes plain ASCII text and checks that
// punctuation is dropped, alphanumeric words such as "Go1" are kept whole,
// and each token carries the expected text, position, and length.
func TestTokenizer_Basic(t *testing.T) {
	text := Text{Body: "Hello, world! Go1 is great."}
	tok := NewTokenizer()
	tokens, err := tok.Tokenize(text)
	if err != nil {
		t.Fatalf("Tokenize returned error: %v", err)
	}

	expected := []string{"Hello", "world", "Go1", "is", "great"}
	if len(tokens) != len(expected) {
		t.Fatalf("expected %d tokens, got %d: %#v", len(expected), len(tokens), tokens)
	}
	for i, e := range expected {
		if tokens[i].Text != e {
			t.Errorf("token %d text: expected %q, got %q", i, e, tokens[i].Text)
		}
		if tokens[i].Position != i {
			t.Errorf("token %d position: expected %d, got %d", i, i, tokens[i].Position)
		}
		if tokens[i].Length != len(e) {
			t.Errorf("token %d length: expected %d, got %d", i, len(e), tokens[i].Length)
		}
	}
}

// TestTokenizer_UnicodeAndPunctuation tokenizes mixed Cyrillic and Latin
// text and checks that multibyte words survive intact while punctuation,
// including the em dash, produces no tokens.
func TestTokenizer_UnicodeAndPunctuation(t *testing.T) {
	text := Text{Body: "Привет, мир! — hello?"}
	tok := NewTokenizer()
	tokens, err := tok.Tokenize(text)
	if err != nil {
		t.Fatalf("Tokenize returned error: %v", err)
	}

	expected := []string{"Привет", "мир", "hello"}
	if len(tokens) != len(expected) {
		t.Fatalf("expected %d tokens, got %d: %#v", len(expected), len(tokens), tokens)
	}
	for i, e := range expected {
		if tokens[i].Text != e {
			t.Errorf("token %d text: expected %q, got %q", i, e, tokens[i].Text)
		}
	}
}

// TestTokenizer_Empty checks that whitespace-only input yields no tokens
// and no error.
func TestTokenizer_Empty(t *testing.T) {
	tok := NewTokenizer()
	tokens, err := tok.Tokenize(Text{Body: " \t\n "})
	if err != nil {
		t.Fatalf("Tokenize returned error: %v", err)
	}
	if len(tokens) != 0 {
		t.Fatalf("expected 0 tokens for whitespace-only input, got %d", len(tokens))
	}
}