package enrich

import "testing"

// TestTokenizer_Basic checks token text, position, and length for plain ASCII input.
func TestTokenizer_Basic(t *testing.T) {
	text := Text{Body: "Hello, world! Go1 is great."}
	tok := NewTokenizer()

	tokens, err := tok.Tokenize(text)
	if err != nil {
		t.Fatalf("Tokenize returned error: %v", err)
	}

	expected := []string{"Hello", "world", "Go1", "is", "great"}
	if len(tokens) != len(expected) {
		t.Fatalf("expected %d tokens, got %d: %#v", len(expected), len(tokens), tokens)
	}
	for i, e := range expected {
		if tokens[i].Text != e {
			t.Errorf("token %d text: expected %q, got %q", i, e, tokens[i].Text)
		}
		if tokens[i].Position != i {
			t.Errorf("token %d position: expected %d, got %d", i, i, tokens[i].Position)
		}
		if tokens[i].Length != len(e) {
			t.Errorf("token %d length: expected %d, got %d", i, len(e), tokens[i].Length)
		}
	}
}

// TestTokenizer_UnicodeAndPunctuation checks that Cyrillic words are tokenized
// and that punctuation and dashes are dropped.
func TestTokenizer_UnicodeAndPunctuation(t *testing.T) {
	text := Text{Body: "Привет, мир! — hello?"}
	tok := NewTokenizer()

	tokens, err := tok.Tokenize(text)
	if err != nil {
		t.Fatalf("Tokenize returned error: %v", err)
	}

	expected := []string{"Привет", "мир", "hello"}
	if len(tokens) != len(expected) {
		t.Fatalf("expected %d tokens, got %d: %#v", len(expected), len(tokens), tokens)
	}
	for i, e := range expected {
		if tokens[i].Text != e {
			t.Errorf("token %d text: expected %q, got %q", i, e, tokens[i].Text)
		}
	}
}

// TestTokenizer_Empty checks that whitespace-only input yields no tokens.
func TestTokenizer_Empty(t *testing.T) {
	tok := NewTokenizer()

	tokens, err := tok.Tokenize(Text{Body: " \t\n "})
	if err != nil {
		t.Fatalf("Tokenize returned error: %v", err)
	}
	if len(tokens) != 0 {
		t.Fatalf("expected 0 tokens for whitespace-only input, got %d", len(tokens))
	}
}
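
// The tests above assume the following enrich API, sketched here for
// reference. Everything below is inferred from usage in this file, not
// taken from the actual implementation, so the exact struct layouts and
// the receiver type of Tokenize are assumptions:
//
//	type Text struct{ Body string }
//
//	type Token struct {
//		Text     string // the token's text
//		Position int    // 0-based index of the token within the body
//		Length   int    // length of Text in bytes (only asserted for ASCII input)
//	}
//
//	func NewTokenizer() *Tokenizer
//	func (tok *Tokenizer) Tokenize(text Text) ([]Token, error)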