forked from mlc-ai/tokenizers-cpp
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathexample.cc
More file actions
148 lines (115 loc) · 4.66 KB
/
example.cc
File metadata and controls
148 lines (115 loc) · 4.66 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
#include <tokenizers_cpp.h>

#include <cassert>
#include <chrono>
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <memory>
#include <string>
#include <vector>
using tokenizers::Tokenizer;
// Read the entire file at `path` into a byte string.
// Exits the process with status 1 if the file cannot be opened or read.
std::string LoadBytesFromFile(const std::string& path) {
  std::ifstream fs(path, std::ios::in | std::ios::binary);
  if (fs.fail()) {
    std::cerr << "Cannot open " << path << std::endl;
    exit(1);
  }
  std::string data;
  // Determine the file size by seeking to the end.
  fs.seekg(0, std::ios::end);
  size_t size = static_cast<size_t>(fs.tellg());
  fs.seekg(0, std::ios::beg);
  data.resize(size);
  fs.read(data.data(), size);
  // Fix: detect a short or failed read instead of silently returning a
  // buffer whose tail was never written.
  if (!fs) {
    std::cerr << "Cannot read " << path << std::endl;
    exit(1);
  }
  return data;
}
// Print token ids on one line in the form "tokens=[a, b, c]".
void PrintEncodeResult(const std::vector<int>& ids) {
  std::cout << "tokens=[";
  const char* sep = "";
  for (const int id : ids) {
    std::cout << sep << id;
    sep = ", ";  // separator precedes every element after the first
  }
  std::cout << "]" << std::endl;
}
// Exercise a tokenizer: round-trip encode/decode a fixed prompt, spot-check
// IdToToken/TokenToId on a handful of ids, and report the vocabulary size.
// `check_id_back` controls whether TokenToId(IdToToken(id)) must equal id.
void TestTokenizer(std::unique_ptr<Tokenizer> tok, bool print_vocab = false,
                   bool check_id_back = true) {
  // Check #1. Encode and Decode
  const std::string prompt = "What is the capital of Canada?";
  const std::vector<int> token_ids = tok->Encode(prompt);
  const std::string round_tripped = tok->Decode(token_ids);
  PrintEncodeResult(token_ids);
  std::cout << "decode=\"" << round_tripped << "\"" << std::endl;
  assert(round_tripped == prompt);
  // Check #2. IdToToken and TokenToId
  const int32_t sample_ids[] = {0, 1, 2, 3, 32, 33, 34, 130, 131, 1000};
  for (const int32_t id : sample_ids) {
    const auto token = tok->IdToToken(id);
    const auto id_back = tok->TokenToId(token);
    std::cout << "id=" << id << ", token=\"" << token << "\", id_new=" << id_back << std::endl;
    if (check_id_back) {
      assert(id == id_back);
    }
  }
  // Check #3. GetVocabSize
  std::cout << "vocab_size=" << tok->GetVocabSize() << std::endl;
  std::cout << std::endl;
}
// Sentencepiece tokenizer
// - dist/tokenizer.model
void SentencePieceTokenizerExample() {
  std::cout << "Tokenizer: SentencePiece" << std::endl;
  // Use steady_clock for interval timing: it is monotonic, whereas
  // high_resolution_clock may alias system_clock and jump with clock
  // adjustments, producing a wrong (even negative) load time.
  auto start = std::chrono::steady_clock::now();
  // Read blob from file.
  auto blob = LoadBytesFromFile("dist/tokenizer.model");
  // Note: all the current factory APIs take an in-memory blob as input.
  // This gives some flexibility on how these blobs can be read.
  auto tok = Tokenizer::FromBlobSentencePiece(blob);
  auto end = std::chrono::steady_clock::now();
  auto duration = std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count();
  std::cout << "Load time: " << duration << " ms" << std::endl;
  TestTokenizer(std::move(tok), false, true);
}
// HF tokenizer
// - dist/tokenizer.json
void HuggingFaceTokenizerExample() {
  std::cout << "Tokenizer: Huggingface" << std::endl;
  // steady_clock is monotonic; high_resolution_clock may not be, which
  // makes it unsuitable for measuring elapsed time.
  auto start = std::chrono::steady_clock::now();
  // Read blob from file.
  auto blob = LoadBytesFromFile("dist/tokenizer.json");
  // Note: all the current factory APIs take an in-memory blob as input.
  // This gives some flexibility on how these blobs can be read.
  auto tok = Tokenizer::FromBlobJSON(blob);
  auto end = std::chrono::steady_clock::now();
  auto duration = std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count();
  std::cout << "Load time: " << duration << " ms" << std::endl;
  TestTokenizer(std::move(tok), false, true);
}
// HF byte-level BPE tokenizer
// - dist/vocab.json
// - dist/merges.txt
void HuggingFaceBPETokenizerExample() {
  std::cout << "Tokenizer: Huggingface BPE" << std::endl;
  // steady_clock is monotonic; high_resolution_clock may not be, which
  // makes it unsuitable for measuring elapsed time.
  auto start = std::chrono::steady_clock::now();
  // Read blob from file.
  auto vocab_blob = LoadBytesFromFile("dist/vocab.json");
  auto merges_blob = LoadBytesFromFile("dist/merges.txt");
  // Note: all the current factory APIs take an in-memory blob as input.
  // This gives some flexibility on how these blobs can be read.
  auto tok = Tokenizer::FromBlobByteLevelBPE(vocab_blob, merges_blob);
  auto end = std::chrono::steady_clock::now();
  auto duration = std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count();
  std::cout << "Load time: " << duration << " ms" << std::endl;
  TestTokenizer(std::move(tok), false, true);
}
// RWKV world tokenizer
// - dist/tokenizer_model
void RWKVWorldTokenizerExample() {
  std::cout << "Tokenizer: RWKVWorld" << std::endl;
  // steady_clock is monotonic; high_resolution_clock may not be, which
  // makes it unsuitable for measuring elapsed time.
  auto start = std::chrono::steady_clock::now();
  // Unlike the other factories, this one takes a file path, not a blob.
  auto tok = Tokenizer::FromBlobRWKVWorld("dist/tokenizer_model");
  auto end = std::chrono::steady_clock::now();
  auto duration = std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count();
  std::cout << "Load time: " << duration << " ms" << std::endl;
  // We cannot check id back for RWKVWorldTokenizer yet.
  TestTokenizer(std::move(tok), false, false);
}
// Run every tokenizer example in sequence.
int main(int argc, char* argv[]) {
  (void)argc;  // command-line arguments are unused
  (void)argv;
  SentencePieceTokenizerExample();
  HuggingFaceTokenizerExample();
  HuggingFaceBPETokenizerExample();
  RWKVWorldTokenizerExample();
  return 0;
}