Yume
token.cpp
Go to the documentation of this file.
1#include "token.hpp"
2#include <algorithm>
3#include <cctype>
4#include <cstddef>
5#include <functional>
6#include <initializer_list>
7#include <iterator>
8#include <llvm/ADT/StringExtras.h>
9#include <llvm/Support/Format.h>
10#include <llvm/Support/raw_ostream.h>
11#include <sstream>
12#include <stdexcept>
13#include <tuple>
14#include <type_traits>
15#include <utility>
16#include <vector>
17
18namespace yume {
19using char_raw_fn = bool(char);
20struct TokenState {
21 bool valid;
22 char c;
23 size_t index;
24 llvm::raw_string_ostream& stream;
25
26 auto validate(bool val = true) -> bool {
27 valid |= val;
28 return val;
29 }
30
31 auto accept(bool ok) -> bool {
32 if (ok)
33 stream.write(c);
34 return ok;
35 }
36
37 auto accept() -> bool {
38 stream.write(c);
39 return true;
40 }
41
42 auto accept(char chr) -> bool { return accept(c == chr); }
43 auto accept_not(char chr) -> bool { return accept(c != chr); }
44 auto accept(char_raw_fn fn) -> bool { return accept(fn(c)); }
45
46 auto accept_validate(auto x) -> bool { return validate(accept(x)); }
47};
48
49template <typename T>
50concept char_fn = requires(T t, TokenState& state) {
51 { t(state) } -> std::same_as<bool>;
52 };
53
54/// Contains the state while the tokenizer is running, such as the position within the file currently being read
55class Tokenizer {
56 vector<Token> m_tokens{};
57 std::istream& m_in;
58 char m_last;
59 bool m_error_state = false;
60 int m_count{};
61 int m_line = 1;
62 int m_col = 1;
63 int m_begin_line = 1;
64 int m_begin_col = 1;
65 const char* m_source_file;
66 std::string m_stream_buffer;
67
68 static auto unescape(char c) -> char {
69 switch (c) {
70 case '0': return '\x00';
71 case 'n': return '\n';
72 case 'r': return '\r';
73 case 't': return '\t';
74 default: return c;
75 }
76 }
77
78public:
79 /// Words consist of alphanumeric characters, or underscores, but *must* begin with a letter.
80 constexpr static const auto is_word = [](TokenState& state) {
81 if (state.index == 0)
82 return state.accept_validate(llvm::isAlpha(state.c) || state.c == '_');
83 return state.accept_validate(llvm::isAlnum(state.c) || state.c == '_');
84 };
85
86 /// Strings are delimited by double quotes `"` and may contain escapes.
87 constexpr static const auto is_str = [end = false, escape = false](TokenState& state) mutable {
88 if (end)
89 return false;
90
91 if (state.index == 0)
92 return state.c == '"';
93
94 if (state.c == '\\' && !escape) {
95 escape = true;
96 } else if (state.c == '"' && !escape && !end) {
97 end = true;
98 state.validate();
99 } else if (escape) {
100 state.stream.write(unescape(state.c));
101 escape = false;
102 } else {
103 state.stream.write(state.c);
104 }
105 return true;
106 };
107
108 /// Character literals begin with a question mark `?` and may contain escapes.
109 constexpr static const auto is_char_lit = [escape = false](TokenState& state) mutable {
110 if (state.index == 0)
111 return state.c == '?';
112
113 if (state.index == 1) {
114 if (state.c == '\\')
115 escape = true;
116 else
117 state.stream.write(state.c);
118 return state.validate();
119 }
120 if (state.index == 2 && escape) {
121 state.stream.write(unescape(state.c));
122 return state.validate();
123 }
124 return false;
125 };
126
127 /// Comments begin with an octothorpe `#` and last until the end of the line.
128 constexpr static const auto is_comment = [](TokenState& state) {
129 if (state.index == 0)
130 return state.accept_validate('#');
131 return state.accept_validate(state.c != '\n');
132 };
133
134 /// This matches both regular numbers (0-9), and hex number. Hex numbers begin with `0x`, and consist of any of 0-9,
135 /// a-f or A-F. If the first character is a 0, is is ambiguous and must be checked further.
136 constexpr static const auto is_num_or_hex_num = [possibly_hex = false](TokenState& state) mutable {
137 if (state.index == 0 && state.c == '0') {
138 possibly_hex = true;
139 return state.accept_validate(true);
140 }
141 if (possibly_hex && state.index == 1) {
142 if (state.c == 'x') {
143 // Invalidate, since we need a character after the x
144 state.valid = false;
145 return state.accept();
146 }
147 possibly_hex = false;
148 }
149 if (possibly_hex)
150 return state.accept_validate(llvm::isHexDigit);
151 return state.accept_validate(llvm::isDigit);
152 };
153
154 /// Generate a criterion matching a single character from any within the string `checks`.
155 constexpr static const auto is_any_of = [](string_view checks) {
156 return [checks](TokenState& state) {
157 return state.index == 0 && state.accept_validate(checks.find(state.c) != string::npos);
158 };
159 };
160
161 /// Generate a criterion matching one or both of the character
162 constexpr static const auto is_partial = [](char c1, char c2) {
163 return [c1, c2](TokenState& state) {
164 if (state.index == 0)
165 return state.accept_validate(c1);
166 if (state.index == 1)
167 return state.accept_validate(c2);
168 return false;
169 };
170 };
171
172 /// Generate a criterion matching the singular character.
173 constexpr static const auto is_char = [](char chr) {
174 return [chr](TokenState& state) { return state.index == 0 && state.accept_validate(chr); };
175 };
176
177 void tokenize() {
178
179 while (!m_in.eof()) {
180 m_begin_line = m_line;
181 m_begin_col = m_col;
182 // m_begin_last = m_last;
183 // m_begin_position = m_in.tellg();
184
185 if (check_characteristic(Token::Type::Separator, is_char('\n')) ||
186 check_characteristic(Token::Type::Skip, llvm::isSpace) ||
187 check_characteristic(Token::Type::Skip, is_comment) ||
188 check_characteristic(Token::Type::Number, is_num_or_hex_num) ||
189 check_characteristic(Token::Type::Literal, is_str) || //
190 check_characteristic(Token::Type::Char, is_char_lit) || //
191 check_characteristic(Token::Type::Word, is_word) ||
192 check_characteristic(Token::Type::Symbol, is_partial('=', '=')) || // = and ==
193 check_characteristic(Token::Type::Symbol, is_partial('!', '=')) || // ! and !=
194 check_characteristic(Token::Type::Symbol, is_partial('/', '/')) || // / and //
195 check_characteristic(Token::Type::Symbol, is_partial(':', ':')) || // : and ::
196 check_characteristic(Token::Type::Symbol, is_partial('-', '>')) || // - and ->
197 check_characteristic(Token::Type::Symbol, is_partial('|', '|')) || // | and ||
198 check_characteristic(Token::Type::Symbol, is_partial('&', '&')) || // & and &&
199 check_characteristic(Token::Type::Symbol, is_any_of(R"(()[]{}<>%+.,*@$)"))) {
200
201 if (!m_error_state)
202 continue;
203 }
204 std::stringstream msg;
205 msg << "Tokenizer didn't recognize '" << m_last << "' at " << m_source_file << ":" << m_line << ":" << m_col;
206 throw std::runtime_error(msg.str());
207 m_count++;
208 }
209
210 m_tokens.emplace_back(Token::Type::EndOfFile, std::nullopt, m_count,
211 Loc{m_line, m_col, m_line, m_col, m_source_file});
212 }
213
214 Tokenizer(std::istream& in, const char* source_file) : m_in(in), m_last(next()), m_source_file(source_file) {}
215
216 [[nodiscard]] auto tokens() { return m_tokens; }
217
218private:
219 auto next() -> char {
220 m_in.get(m_last);
221 if (m_last == '\n') {
222 m_line++;
223 m_col = 0;
224 } else {
225 m_col++;
226 }
227 return m_last;
228 }
229
230 /// Determine if the criterion is viable with the current character as the first character, then consume tokens
231 /// until the criterion becomes false. The result is appended to the current list of tokens `m_tokens`.
232 ///
233 /// \returns `true` if the first character is ok
234 auto check_characteristic(Token::Type type, char_fn auto fn) -> bool {
235 m_stream_buffer.clear();
236 auto stream = llvm::raw_string_ostream{m_stream_buffer};
237 auto state = TokenState{false, m_last, 0, stream};
238 if (fn(state)) {
239 auto [atom, end_line, end_col] = consume_characteristic(fn, state);
240 if (state.valid) {
241 m_tokens.emplace_back(type, atom, m_count, Loc{m_begin_line, m_begin_col, end_line, end_col, m_source_file});
242 return true;
243 }
244
245 m_error_state = true;
246 return true;
247 }
248 return false;
249 }
250
251 auto check_characteristic(Token::Type type, char_raw_fn* fn) -> bool {
252 return check_characteristic(type, [fn](TokenState& state) { return state.accept_validate(fn); });
253 }
254
255 /// Consume characters until the criterion becomes false. Note that the first character is assumed to already be
256 /// matched.
257 /// \returns `Atom` containing the payload of the matched token, and the line and col number it stopped on.
258 auto consume_characteristic(char_fn auto fn, TokenState& state) -> std::tuple<Atom, int, int> {
259 state.index++;
260 int end_line = m_line;
261 int end_col = m_col;
262 next();
263 state.c = m_last;
264 while (!m_in.eof() && fn(state)) {
265 state.index++;
266 end_line = m_line;
267 end_col = m_col;
268 next();
269 state.c = m_last;
270 }
271
272 return {make_atom(state.stream.str()), end_line, end_col};
273 }
274};
275
276auto tokenize_preserve_skipped(std::istream& in, const string& source_file) -> vector<Token> {
277 auto tokenizer = Tokenizer(in, source_file.data());
278 tokenizer.tokenize();
279 return tokenizer.tokens();
280}
281
282auto tokenize(std::istream& in, const string& source_file) -> vector<Token> {
283 vector<Token> original = tokenize_preserve_skipped(in, source_file);
284 vector<Token> filtered{};
285 filtered.reserve(original.size());
286 std::copy_if(original.begin(), original.end(), std::back_inserter(filtered),
287 [](const Token& t) { return t.type != Token::Type::Skip; });
288 return filtered;
289}
290
291auto operator<<(llvm::raw_ostream& os, const Token& token) -> llvm::raw_ostream& {
292 os << "Token" << llvm::format_decimal(token.index, 4) << '(';
293 const auto& loc = token.loc;
294 os << loc.to_string() << ",";
295 os << Token::type_name(token.type);
296 if (token.payload.has_value()) {
297 os << ",\"";
298 os.write_escaped(string(*token.payload));
299 os << '\"';
300 }
301 os << ")";
302 return os;
303}
304} // namespace yume
Contains the state while the tokenizer is running, such as the position within the file currently bei...
Definition: token.cpp:55
void tokenize()
Definition: token.cpp:177
static constexpr const auto is_char
Generate a criterion matching the singular character.
Definition: token.cpp:173
static constexpr const auto is_partial
Generate a criterion matching one or both of the characters.
Definition: token.cpp:162
static constexpr const auto is_word
Words consist of alphanumeric characters, or underscores, but must begin with a letter.
Definition: token.cpp:80
auto tokens()
Definition: token.cpp:216
static constexpr const auto is_str
Strings are delimited by double quotes " and may contain escapes.
Definition: token.cpp:87
static constexpr const auto is_char_lit
Character literals begin with a question mark ? and may contain escapes.
Definition: token.cpp:109
Tokenizer(std::istream &in, const char *source_file)
Definition: token.cpp:214
static constexpr const auto is_num_or_hex_num
This matches both regular numbers (0-9), and hex number. Hex numbers begin with 0x,...
Definition: token.cpp:136
static constexpr const auto is_comment
Comments begin with an octothorpe # and last until the end of the line.
Definition: token.cpp:128
static constexpr const auto is_any_of
Generate a criterion matching a single character from any within the string checks.
Definition: token.cpp:155
string_view end
Definition: errors.cpp:42
Definition: ast.cpp:8
auto make_atom(std::string_view value) noexcept -> Atom
Create an Atom with the given string content.
Definition: atom.hpp:34
bool(char) char_raw_fn
Definition: token.cpp:19
auto tokenize_preserve_skipped(std::istream &in, const string &source_file) -> vector< Token >
Consume the contents of the input stream and create corresponding tokens, preserving every token,...
Definition: token.cpp:276
auto operator<<(llvm::raw_ostream &os, const Token &token) -> llvm::raw_ostream &
Definition: token.cpp:291
auto tokenize(std::istream &in, const string &source_file) -> vector< Token >
Consume the contents of the input stream and create corresponding tokens, ignoring insignificant whit...
Definition: token.cpp:282
Represents a location in source code, as a range starting at a line and column and ending at some oth...
Definition: token.hpp:26
auto accept() -> bool
Definition: token.cpp:37
auto accept_not(char chr) -> bool
Definition: token.cpp:43
auto accept(bool ok) -> bool
Definition: token.cpp:31
auto validate(bool val=true) -> bool
Definition: token.cpp:26
llvm::raw_string_ostream & stream
Definition: token.cpp:24
auto accept(char chr) -> bool
Definition: token.cpp:42
size_t index
Definition: token.cpp:23
auto accept_validate(auto x) -> bool
Definition: token.cpp:46
auto accept(char_raw_fn fn) -> bool
Definition: token.cpp:44
A categorized token in source code, created by the tokenizer. These tokens are consumed by the lexer.
Definition: token.hpp:80
@ Symbol
Special characters, such as those representing operators.
@ Separator
A newline or a semicolon ;
@ Word
Any form of keyword or identifier, essentially the "default" token type.
@ Skip
Tokens which should be ignored, i.e. insignificant whitespace.
@ Char
A character literal, beginning with ?
@ EndOfFile
A token added at the very end of the file.
@ Number
A number literal.
@ Literal
A string literal, enclosed in quotes.
static auto constexpr type_name(Type type) -> const char *
Definition: token.hpp:91