Yume
token.hpp
#pragma once

#include "atom.hpp"
#include "util.hpp"
#include <algorithm>
#include <optional>
#include <sstream>
#include <string>
#include <string_view>
#include <utility>
#include <vector>

namespace llvm {
class raw_ostream;
}

namespace yume {

/// Represents a location in source code, as a range starting at a line and column and ending at some other line and
/// column of some file.
/**
 * Lines are 1-indexed, meaning a line number of 0 as either the beginning or end represents an unknown location.
 * The range is inclusive, meaning a location representing just a single character would have its begin line and column
 * be equal to its end line and column. There is no way to store a location representing "zero characters".
 */
struct Loc {
  int begin_line;
  int begin_col;
  int end_line;
  int end_col;
  const char* file;

  constexpr auto operator<=>(const Loc& other) const noexcept = default;

  /// \brief Create a new location representing the "union" of two locations.
  ///
  /// The new location will have the beginning line and column of whichever location has a beginning earlier in the
  /// file, and an ending line and column of whichever location has an ending later in the file.
  constexpr auto operator+(const Loc& other) const noexcept -> Loc {
    YUME_ASSERT(other.file == file, "Cannot add locations in different files");
    auto [min_begin_line, min_begin_col] =
        std::min(std::pair{begin_line, begin_col}, std::pair{other.begin_line, other.begin_col});
    auto [max_end_line, max_end_col] = std::max(std::pair{end_line, end_col}, std::pair{other.end_line, other.end_col});
    return Loc{min_begin_line, min_begin_col, max_end_line, max_end_col, file};
  }

  [[nodiscard]] auto to_string() const -> string {
    stringstream ss{};
    if (file != nullptr) {
      if (auto filename = string{file}; filename.front() == '<' && filename.back() == '>') {
        // This is a special "fake path", so we don't try to normalize it.
        ss << filename;
      } else {
        ss << fs::path(file).stem().native();
      }
    }

    if (!valid()) {
      ss << ":?";
    } else {
      ss << ':' << begin_line << ':' << begin_col;
      if (end_line != begin_line)
        ss << ' ' << end_line << ':' << end_col;
      else if (end_col != begin_col)
        ss << " :" << end_col;
    }
    return ss.str();
  }

  [[nodiscard]] auto valid() const -> bool { return begin_line > 0 && end_line > 0; }

  /// Return a new Loc which refers to the first character of the current Loc.
  [[nodiscard]] auto single() const -> Loc { return {begin_line, begin_col, begin_line, begin_col, file}; }
};
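
// Illustrative sketch, not part of the original header: combining two locations with operator+ and
// rendering the result. The helper name, file name and coordinates below are hypothetical placeholders.
inline auto example_merge_locations(const char* example_file) -> string {
  Loc first{3, 5, 3, 9, example_file};  // line 3, columns 5 through 9
  Loc second{4, 1, 4, 2, example_file}; // line 4, columns 1 through 2
  Loc merged = first + second;          // union: begins at 3:5, ends at 4:2
  // merged.single() would refer to just the first character, at 3:5.
  return merged.to_string();
}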

/// A categorized token in source code, created by the tokenizer. These tokens are consumed by the lexer.
/**
 * Each token has a type, an associated payload (usually the text the token was created from), and a location, \ref Loc.
 */
struct Token {
  enum struct Type {
    Word,      ///< Any form of keyword or identifier, essentially the "default" token type
    Skip,      ///< Tokens which should be ignored, i.e. insignificant whitespace
    Symbol,    ///< Special characters, such as those representing operators
    Literal,   ///< A string literal, enclosed in quotes
    Number,    ///< A number literal
    Char,      ///< A character literal, beginning with `?`
    Separator, ///< A newline or a semicolon `;`
    EndOfFile  ///< A token added at the very end of the file
  };
  static auto inline constexpr type_name(Type type) -> const char* {
    using enum Token::Type;
    switch (type) {
    case Word: return "Word";
    case Skip: return "Skip";
    case Symbol: return "Symbol";
    case Literal: return "Literal";
    case Number: return "Number";
    case Char: return "Char";
    case Separator: return "Separator";
    case EndOfFile: return "End of File";
    }
  }

  using Payload = optional<Atom>;

  Type type;
  Payload payload;
  int index = -1;
  Loc loc{};

  [[nodiscard]] auto is_a(const std::pair<Type, Atom>& type_atom) const -> bool {
    return type == type_atom.first && payload == type_atom.second;
  }

  explicit constexpr Token(Type type) : type(type) {}
  constexpr Token(Type type, Payload payload) noexcept : type(type), payload(payload) {}
  constexpr Token(Type type, Payload payload, int i, Loc loc) noexcept
      : type(type), payload(payload), index{i}, loc{loc} {}

  friend auto operator<<(llvm::raw_ostream& os, const Token& token) -> llvm::raw_ostream&;
};
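
// Illustrative sketch, not part of the original header: describing a token using type_name() and its
// source location. The helper name is hypothetical.
inline auto example_describe(const Token& token) -> string {
  stringstream ss{};
  ss << Token::type_name(token.type) << " at " << token.loc.to_string();
  return ss.str();
}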

/// Consume the contents of the input stream and create corresponding tokens, preserving every token, including
/// whitespace. This is usually undesired.
/// \sa tokenize
auto tokenize_preserve_skipped(std::istream& in, const string& source_file) -> vector<Token>;

/// Consume the contents of the input stream and create corresponding tokens, ignoring insignificant whitespace
auto tokenize(std::istream& in, const string& source_file) -> vector<Token>;
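
// Illustrative sketch, not part of the original header: tokenizing source text held in memory.
// The helper name is hypothetical; the angle-bracketed name mirrors the "fake path" convention
// recognized by Loc::to_string.
inline auto example_tokenize_string(const string& source) -> vector<Token> {
  std::istringstream in{source};
  return tokenize(in, "<string>");
}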
} // namespace yume