6#include <initializer_list>
8#include <llvm/ADT/StringExtras.h>
9#include <llvm/Support/Format.h>
10#include <llvm/Support/raw_ostream.h>
50concept char_fn =
requires(T t, TokenState& state) {
51 { t(state) } -> std::same_as<bool>;
56 vector<Token> m_tokens{};
59 bool m_error_state =
false;
65 const char* m_source_file;
66 std::string m_stream_buffer;
68 static auto unescape(
char c) ->
char {
70 case '0':
return '\x00';
71 case 'n':
return '\n';
72 case 'r':
return '\r';
73 case 't':
return '\t';
82 return state.accept_validate(llvm::isAlpha(state.c) || state.c ==
'_');
83 return state.accept_validate(llvm::isAlnum(state.c) || state.c ==
'_');
92 return state.c ==
'"';
94 if (state.c ==
'\\' && !escape) {
96 }
else if (state.c ==
'"' && !escape && !
end) {
100 state.stream.write(unescape(state.c));
103 state.stream.write(state.c);
110 if (state.index == 0)
111 return state.c ==
'?';
113 if (state.index == 1) {
117 state.stream.write(state.c);
118 return state.validate();
120 if (state.index == 2 && escape) {
121 state.stream.write(unescape(state.c));
122 return state.validate();
129 if (state.index == 0)
130 return state.accept_validate(
'#');
131 return state.accept_validate(state.c !=
'\n');
137 if (state.index == 0 && state.c ==
'0') {
139 return state.accept_validate(
true);
141 if (possibly_hex && state.index == 1) {
142 if (state.c ==
'x') {
145 return state.accept();
147 possibly_hex =
false;
150 return state.accept_validate(llvm::isHexDigit);
151 return state.accept_validate(llvm::isDigit);
155 constexpr static const auto is_any_of = [](string_view checks) {
157 return state.index == 0 && state.accept_validate(checks.find(state.c) != string::npos);
162 constexpr static const auto is_partial = [](
char c1,
char c2) {
164 if (state.index == 0)
165 return state.accept_validate(c1);
166 if (state.index == 1)
167 return state.accept_validate(c2);
173 constexpr static const auto is_char = [](
char chr) {
174 return [chr](
TokenState& state) {
return state.index == 0 && state.accept_validate(chr); };
179 while (!m_in.eof()) {
180 m_begin_line = m_line;
204 std::stringstream msg;
205 msg <<
"Tokenizer didn't recognize '" << m_last <<
"' at " << m_source_file <<
":" << m_line <<
":" << m_col;
206 throw std::runtime_error(msg.str());
211 Loc{m_line, m_col, m_line, m_col, m_source_file});
214 Tokenizer(std::istream& in,
const char* source_file) : m_in(in), m_last(next()), m_source_file(source_file) {}
216 [[nodiscard]]
auto tokens() {
return m_tokens; }
219 auto next() ->
char {
221 if (m_last ==
'\n') {
234 auto check_characteristic(
Token::Type type, char_fn
auto fn) ->
bool {
235 m_stream_buffer.clear();
236 auto stream = llvm::raw_string_ostream{m_stream_buffer};
237 auto state = TokenState{
false, m_last, 0, stream};
239 auto [atom, end_line, end_col] = consume_characteristic(fn, state);
241 m_tokens.emplace_back(type, atom, m_count, Loc{m_begin_line, m_begin_col, end_line, end_col, m_source_file});
245 m_error_state =
true;
252 return check_characteristic(type, [fn](TokenState& state) {
return state.accept_validate(fn); });
258 auto consume_characteristic(char_fn
auto fn, TokenState& state) -> std::tuple<Atom, int, int> {
260 int end_line = m_line;
264 while (!m_in.eof() && fn(state)) {
272 return {
make_atom(state.stream.str()), end_line, end_col};
277 auto tokenizer =
Tokenizer(in, source_file.data());
278 tokenizer.tokenize();
279 return tokenizer.tokens();
282auto tokenize(std::istream& in,
const string& source_file) -> vector<Token> {
284 vector<Token> filtered{};
285 filtered.reserve(original.size());
286 std::copy_if(original.begin(), original.end(), std::back_inserter(filtered),
287 [](
const Token& t) { return t.type != Token::Type::Skip; });
292 os <<
"Token" << llvm::format_decimal(token.index, 4) <<
'(';
293 const auto& loc = token.loc;
294 os << loc.to_string() <<
",";
296 if (token.payload.has_value()) {
298 os.write_escaped(
string(*token.payload));
Contains the state while the tokenizer is running, such as the position within the file currently bei...
static constexpr const auto is_char
Generate a criterion matching the single given character.
static constexpr const auto is_partial
Generate a criterion matching one or both of the characters.
static constexpr const auto is_word
Words consist of alphanumeric characters, or underscores, but must begin with a letter.
static constexpr const auto is_str
Strings are delimited by double quotes " and may contain escapes.
static constexpr const auto is_char_lit
Character literals begin with a question mark ? and may contain escapes.
Tokenizer(std::istream &in, const char *source_file)
static constexpr const auto is_num_or_hex_num
This matches both regular numbers (0-9) and hex numbers. Hex numbers begin with 0x,...
static constexpr const auto is_comment
Comments begin with an octothorpe # and last until the end of the line.
static constexpr const auto is_any_of
Generate a criterion matching a single character that is any one of the characters in the string checks.
auto make_atom(std::string_view value) noexcept -> Atom
Create an Atom with the given string content.
auto tokenize_preserve_skipped(std::istream &in, const string &source_file) -> vector< Token >
Consume the contents of the input stream and create corresponding tokens, preserving every token,...
auto operator<<(llvm::raw_ostream &os, const Token &token) -> llvm::raw_ostream &
auto tokenize(std::istream &in, const string &source_file) -> vector< Token >
Consume the contents of the input stream and create corresponding tokens, ignoring insignificant whit...
Represents a location in source code, as a range starting at a line and column and ending at some oth...
auto accept_not(char chr) -> bool
auto accept(bool ok) -> bool
auto validate(bool val=true) -> bool
llvm::raw_string_ostream & stream
auto accept(char chr) -> bool
auto accept_validate(auto x) -> bool
auto accept(char_raw_fn fn) -> bool
A categorized token in source code, created by the tokenizer. These tokens are consumed by the lexer.
@ Symbol
Special characters, such as those representing operators.
@ Separator
A newline or a semicolon ;
@ Word
Any form of keyword or identifier, essentially the "default" token type.
@ Skip
Tokens which should be ignored, i.e. insignificant whitespace.
@ Char
A character literal, beginning with ?
@ EndOfFile
A token added at the very end of the file.
@ Number
A number literal.
@ Literal
A string literal, enclosed in quotes.
static auto constexpr type_name(Type type) -> const char *