Refactor tokenization

This commit is contained in:
2020-08-29 14:50:16 +02:00
parent 71678b2ec6
commit 473334c3db
12 changed files with 254 additions and 212 deletions

View File

@@ -8,7 +8,6 @@ namespace Interpret
struct InterpretationError : public std::exception
{
Token::Token errorToken;
std::string errorMsg;
InterpretationError(Token::Token const & token, std::string const & msg);
};

View File

@@ -46,8 +46,6 @@ namespace Token
static Token CreateMemoryToken(RegisterType const registerType, int const lineNumber, int const lineColumn);
static Token CreateMemoryToken(int const value, int const lineNumber, int const lineColumn);
void DebugPrint() const;
void Print() const;
};
void PrintTokens(std::vector<Token> const & tokens);
}

View File

@@ -8,20 +8,16 @@ namespace Token
class Tokenizer
{
private:
std::vector<std::pair<std::string, std::string>> substitutions;
// argument for string should never be of length zero
Token ExtractToken(
std::string string,
int const lineNumber,
int const lineColumn) const;
void ParseCharacterLiteral(
std::string const & line,
int const lineNumber,
unsigned & lineColumn,
std::vector<Token> & tokens) const;
std::string const & string,
std::size_t const lineNumber,
std::size_t const lineColumn) const;
public:
void Tokenize(std::string const & line, int const lineNumber, std::vector<Token> & tokens);
void Tokenize(
std::string const & line,
std::size_t const lineNumber,
std::vector<Token> & tokens);
};
}

View File

@@ -1,6 +1,18 @@
#pragma once
#include <optional>
#include <string>
namespace Utils
{
// Returns true when c is a newline, space, tab or carriage return.
bool isWhitespaceCharacter(char const c);
// Searches src, starting at pos + 1, for the first occurrence of
// surroundingCharacter and returns the substring that begins at pos and ends
// with that occurrence (both ends included). The character at pos itself is
// not inspected; callers are expected to pass the index of the opening
// delimiter.
// Returns nullopt in case the value is missing its terminator character
std::optional<std::string> getValueSurroundedBy(
std::string const & src,
std::size_t const pos,
char const surroundingCharacter);
// Returns the substring of src that begins at pos and ends just before the
// first whitespace character found after pos. When no whitespace follows, the
// remainder of src starting at pos is returned. The character at pos itself is
// not checked for being whitespace.
std::string getValueSurroundedByWhitespace(
std::string const & src,
std::size_t const pos);
}

View File

@@ -10,13 +10,16 @@ private:
Configuration config;
Execute::VirtualMachine vm;
bool printSubstitutions;
bool printTokens;
bool LoadLinesFromFile(std::string const & filePath, std::vector<std::string> & lines) const;
bool LoadTokens(std::vector<std::string> const & lines, std::vector<Token::Token> & tokens) const;
public:
void SetMemorySize(unsigned const size);
void EnableSubstitutionsLogging();
void EnableTokensLogging();
bool LoadFromFile(std::string const & filePath);

View File

@@ -13,7 +13,7 @@ BINARY = bin/wassembler
all: ${BINARY}
check: ${BINARY}
./$< ./bin/test.wasm -p
./$< ./bin/test.wasm
clean:
-rm -rf build ./${BINARY}

View File

@@ -3,9 +3,9 @@
namespace Interpret
{
InterpretationError::InterpretationError(Token::Token const & token, std::string const & msg)
: errorToken(token),
errorMsg(msg)
: errorToken(token)
{
errorToken.errorMessage = msg;
}
ExpectedArgument::ExpectedArgument(Token::Token const & token)

View File

@@ -8,11 +8,13 @@ int main(int argc, char ** argv)
std::string inputFile;
unsigned memorySize = 1024;
bool printSubstitutions = false;
bool printTokens = false;
auto cli = (
clipp::value("input wasm file").set(inputFile),
clipp::option("-m", "--memory-size") & clipp::value("memory size", memorySize),
clipp::option("-p", "--print-substitutions").set(printSubstitutions)
clipp::option("-ps", "--print-substitutions").set(printSubstitutions),
clipp::option("-pt", "--print-tokens").set(printTokens)
);
if (!clipp::parse(argc, argv, cli))
@@ -28,6 +30,11 @@ int main(int argc, char ** argv)
wassembler.EnableSubstitutionsLogging();
}
if (printTokens)
{
wassembler.EnableTokensLogging();
}
if (!wassembler.LoadFromFile(inputFile))
{
exit(1);

View File

@@ -108,7 +108,12 @@ namespace Token
Token Token::CreateMemoryToken(RegisterType const registerType, int const lineNumber, int const lineColumn)
{
return Token(TokenType::Memory, registerType, registerType != RegisterType::Unknown, lineNumber, lineColumn);
if (registerType == RegisterType::Unknown)
{
return CreateErrorToken("Unknown register used", TokenType::Register, lineNumber, lineColumn);
}
return Token(TokenType::Memory, registerType, true, lineNumber, lineColumn);
}
Token Token::CreateMemoryToken(int const value, int const lineNumber, int const lineColumn)
@@ -116,7 +121,7 @@ namespace Token
return Token(TokenType::Memory, value, true, lineNumber, lineColumn);
}
void Token::DebugPrint() const
void Token::Print() const
{
std::putc(' ', stdout);
switch(type)
@@ -207,28 +212,4 @@ namespace Token
break;
}
}
void PrintTokens(std::vector<Token> const & tokens)
{
std::puts("*** Tokenization result ***");
unsigned statementNumber = 0u;
std::printf("%02u - ", statementNumber);
for(unsigned i = 0u; i < tokens.size(); ++i)
{
auto const & token = tokens[i];
token.DebugPrint();
if (token.type == TokenType::StatementEnd)
{
++statementNumber;
if (i + 1 < tokens.size())
{
std::printf("\n%02u - ", statementNumber);
}
else
{
std::puts("");
}
}
}
}
}

View File

@@ -20,25 +20,115 @@ namespace Token
}
}
Token Tokenizer::ExtractToken(std::string string,
int const lineNumber,
int const lineColumn) const
Token GetCharacterLiteralToken(
std::string const & token,
std::size_t const lineNumber,
std::size_t const lineColumn)
{
if (string.size() == 0)
for(std::size_t i = 1; i < token.size(); ++i)
{
// TODO Should this become an error token?
return Token::CreateEmptyToken(lineNumber, lineColumn);
}
for(std::size_t i = 0; i < substitutions.size(); ++i)
{
if (string == substitutions[i].first)
if (token[i] == '\'')
{
string = substitutions[i].second;
break;
if (i != 2)
{
return Token::CreateErrorToken(
"Character literal must be exactly 1 character long between single quotes",
TokenType::ImmediateInteger,
lineNumber,
lineColumn + 1u);
}
else
{
return Token::CreateImmediateValueToken(
token[1],
lineNumber,
lineColumn + 1);
}
}
}
return Token::CreateErrorToken(
"Non terminated character literal",
TokenType::ImmediateInteger,
lineNumber,
lineColumn);
}
// Builds a Memory token from a bracketed address expression.
//
// token      - the complete source token, e.g. "[$1]" (immediate address) or
//              "[%A]" (register address)
// lineNumber - line the token was found on, forwarded to the created token
// lineColumn - column the token starts at, forwarded to the created token
//
// Returns an error token when the brackets are unbalanced, when nothing
// follows the address prefix, when an immediate address is not an integer, or
// when the prefix is neither '$' nor '%'.
Token GetMemoryToken(
    std::string const & token,
    std::size_t const lineNumber,
    std::size_t const lineColumn)
{
    // Validate the brackets before the length. Otherwise a short unbalanced
    // token such as "[" or "ab]" is reported as an empty address statement
    // instead of a bracket mismatch. The size guard also keeps front()/back()
    // from being called on an empty string.
    bool const hasBothBrackets =
        token.size() >= 2 && token.front() == '[' && token.back() == ']';
    if (!hasBothBrackets)
    {
        return Token::CreateErrorToken(
            "Non terminated memory address brackets",
            TokenType::Memory,
            lineNumber,
            lineColumn);
    }

    // Minimal example: [$1] or [%A]
    if (token.size() < 4)
    {
        return Token::CreateErrorToken(
            "Memory address statement is empty",
            TokenType::Memory,
            lineNumber,
            lineColumn);
    }

    char const memoryPrefix = token[1];
    // Everything between the prefix character and the closing bracket
    std::string const valueString = token.substr(2, token.size() - 3u);

    if (memoryPrefix == '$')
    {
        auto const result = TryParseInt(valueString);
        if (result.has_value())
        {
            return Token::CreateMemoryToken(
                result.value(),
                lineNumber,
                lineColumn);
        }

        return Token::CreateErrorToken(
            "Memory immediate address cannot be parsed as an integer",
            TokenType::Memory,
            lineNumber,
            lineColumn);
    }
    else if (memoryPrefix == '%')
    {
        return Token::CreateMemoryToken(
            GetRegisterType(valueString),
            lineNumber,
            lineColumn);
    }

    // The reported column points at the unexpected prefix character
    return Token::CreateErrorToken(
        "Memory immediate address contains an unexpected value",
        TokenType::Memory,
        lineNumber,
        lineColumn + 1u);
}
// Creates the error token reported when a quote is opened but never closed.
// lineNumber and lineColumn are forwarded unchanged to the error token.
Token GetUnterminatedCharacterLiteralError(
    std::size_t const lineNumber,
    std::size_t const lineColumn)
{
    char const * const message = "Unterminated character or string literal";
    return Token::CreateErrorToken(message, TokenType::Unknown, lineNumber, lineColumn);
}
Token Tokenizer::ExtractToken(
std::string const & string,
std::size_t const lineNumber,
std::size_t const lineColumn) const
{
char const prefix = string[0];
switch(prefix)
{
@@ -67,78 +157,36 @@ namespace Token
lineNumber,
lineColumn);
case '\'':
return GetCharacterLiteralToken(string, lineNumber, lineColumn);
case ';':
return Token::CreateStatementEndToken(lineNumber, lineColumn);
case '[':
return GetMemoryToken(string, lineNumber, lineColumn);
default:
break;
}
char const postfix = string[string.size() - 1];
if (postfix == ':')
switch(postfix)
{
case ']':
return GetMemoryToken(string, lineNumber, lineColumn);
case ':':
// TODO check if label is an Operand?
return Token::CreateLabelToken(
string.substr(0, string.size() - 1),
lineNumber,
lineColumn);
}
if (prefix == '[' && postfix == ']')
{
if(string.size() < 4)
{
return Token::CreateErrorToken(
"Memory address statement is empty",
TokenType::Memory,
lineNumber,
lineColumn);
}
char const memoryPrefix = string[1];
std::string const valueString = string.substr(2, string.size() - 3u);
if (memoryPrefix == '$')
{
auto const result = TryParseInt(valueString);
if (result.has_value())
{
return Token::CreateMemoryToken(
result.value(),
lineNumber,
lineColumn);
}
return Token::CreateErrorToken(
"Memory immediate address cannot be parsed as an integer",
TokenType::Memory,
lineNumber,
lineColumn);
}
else if (memoryPrefix == '%')
{
return Token::CreateMemoryToken(
GetRegisterType(valueString),
lineNumber,
lineColumn);
}
else
{
return Token::CreateErrorToken(
"Memory immediate address contains an unexpected value",
TokenType::Memory,
lineNumber,
lineColumn + 1u);
}
}
else if (prefix == '[' || postfix == ']')
{
int const errorLineColumn = (prefix == '[') ? lineColumn : (lineColumn + string.size() - 1u);
return Token::CreateErrorToken(
"Non terminated memory address brackets",
TokenType::Memory,
lineNumber,
errorLineColumn);
case '\'':
case '\"':
// This shouldn't happen
return GetUnterminatedCharacterLiteralError(lineNumber, lineColumn);
}
OperandType const opType = GetOperandType(string);
@@ -147,125 +195,69 @@ namespace Token
return Token::CreateOperandToken(opType, lineNumber, lineColumn);
}
// Last resort: it must be a label
// Last resort: it must be a jump target
return Token::CreateLabelToken(string, lineNumber, lineColumn);
}
// Modifies the lineColumn parameter to point at the character literal end
void Tokenizer::ParseCharacterLiteral(
std::string const & line,
int const lineNumber,
unsigned & lineColumn,
std::vector<Token> & tokens) const
{
for(unsigned int i = lineColumn + 1; i < line.size(); ++i)
{
if (line[i] == '\'')
{
if (lineColumn + 2u != i)
{
tokens.push_back(Token::CreateErrorToken(
"Character literal must be exactly 1 character long between single quotes",
TokenType::ImmediateInteger,
lineNumber,
lineColumn + 1u));
}
else
{
tokens.push_back(Token::CreateImmediateValueToken(
line[i - 1],
lineNumber,
lineColumn + 1));
}
lineColumn = i;
return;
}
}
tokens.push_back(Token::CreateErrorToken(
"Non terminated character literal",
TokenType::ImmediateInteger,
lineNumber,
lineColumn));
lineColumn = line.size();
}
void Tokenizer::Tokenize(
std::string const & line,
int const lineNumber,
std::size_t const lineNumber,
std::vector<Token> & tokens)
{
enum class TokenizerState
for(std::size_t column = 0u; column < line.size(); ++column)
{
LookForNextToken,
LookForTokenEnd,
};
TokenizerState state = TokenizerState::LookForNextToken;
unsigned columnTokenStart = 0;
for(unsigned column = 0u; column < line.size(); ++column)
{
switch(state)
if (Utils::isWhitespaceCharacter(line[column]))
{
case TokenizerState::LookForNextToken:
if (!Utils::isWhitespaceCharacter(line[column]))
continue;
}
switch(line[column])
{
case '\'':
case '\"':
{
if (line[column] == '\'')
auto const result = Utils::getValueSurroundedBy(
line,
column,
line[column]);
if (result.has_value())
{
// TODO integrate this better with the existing extract token
// infrastructure
ParseCharacterLiteral(line, lineNumber, column, tokens);
break;
tokens.push_back(ExtractToken(result.value(), lineNumber, column));
column += result.value().size() - 1;
}
columnTokenStart = column;
switch(line[column])
else
{
case ';':
tokens.push_back(
ExtractToken(line.substr(column, 1), lineNumber, column));
break;
GetUnterminatedCharacterLiteralError(lineNumber, column));
default:
state = TokenizerState::LookForTokenEnd;
break;
// Parsing must stop here, the line is malformed
return;
}
}
break;
case TokenizerState::LookForTokenEnd:
if (Utils::isWhitespaceCharacter(line[column]) || line[column] == ';')
{
tokens.push_back(ExtractToken(line.substr(columnTokenStart, column - columnTokenStart), lineNumber, columnTokenStart));
if (line[column] == ';')
{
tokens.push_back(ExtractToken(line.substr(column, 1), lineNumber, column));
}
state = TokenizerState::LookForNextToken;
}
case ';':
tokens.push_back(ExtractToken(";", lineNumber, column));
break;
default:
std::puts("DEBUG: Unhandled TokenizerState value");
{
auto const result = Utils::getValueSurroundedByWhitespace(line, column);
auto const lastCharacterIndex = result.size() - 1;
if (result[lastCharacterIndex] == ';')
{
tokens.push_back(ExtractToken(result.substr(0, result.size() -1), lineNumber, column));
tokens.push_back(ExtractToken(";", lineNumber, column + lastCharacterIndex));
}
else
{
tokens.push_back(ExtractToken(result, lineNumber, column));
}
column += result.size();
}
break;
}
}
switch(state)
{
case TokenizerState::LookForTokenEnd:
tokens.push_back(ExtractToken(
line.substr(columnTokenStart, line.size()),
lineNumber,
columnTokenStart));
break;
case TokenizerState::LookForNextToken:
default:
break;
}
}
}

View File

@@ -6,4 +6,35 @@ namespace Utils
{
return c == '\n' || c == ' ' || c == '\t' || c == '\r';
}
// Extracts a delimited value from src. pos is the index of the opening
// delimiter; the search for the closing surroundingCharacter starts one
// character later. The returned string includes both delimiters. Yields
// nullopt when no closing delimiter follows pos.
std::optional<std::string> getValueSurroundedBy(
    std::string const & src,
    std::size_t const pos,
    char const surroundingCharacter)
{
    std::size_t const terminator = src.find(surroundingCharacter, pos + 1);
    if (terminator == std::string::npos)
    {
        return std::nullopt;
    }

    // Length spans from the opening delimiter up to and including the closing one
    return src.substr(pos, terminator - pos + 1);
}
// Returns the run of characters in src that starts at pos and stops just
// before the first whitespace character after pos, or at the end of src when
// no whitespace follows. The character at pos itself is never tested.
std::string getValueSurroundedByWhitespace(
    std::string const & src,
    std::size_t const pos)
{
    std::size_t end = pos + 1;
    while (end < src.size() && !isWhitespaceCharacter(src[end]))
    {
        ++end;
    }

    // substr clamps the length to the end of src, so this also covers the
    // case where the scan ran off the end without meeting whitespace
    return src.substr(pos, end - pos);
}
}

View File

@@ -21,7 +21,6 @@ void PrintBadToken(Token::Token const & token, std::vector<std::string> const &
void PrintTokenError(Interpret::InterpretationError const & err, std::vector<std::string> const & lines)
{
std::printf("%s ", err.errorMsg.c_str());
PrintBadToken(err.errorToken, lines);
}
@@ -66,6 +65,24 @@ bool Wassembler::LoadTokens(std::vector<std::string> const & lines, std::vector<
}
}
if (printTokens && tokens.size() > 0)
{
int previousLine = tokens[0].lineNumber;
std::printf("Line %04i: ", previousLine);
for(auto const & token : tokens)
{
if (token.lineNumber != previousLine)
{
std::putc('\n', stdout);
previousLine = token.lineNumber;
std::printf("Line %04i: ", previousLine);
}
token.Print();
}
std::putc('\n', stdout);
}
// Validate the syntax
bool syntaxError = false;
for(auto const & token : tokens)
@@ -91,6 +108,11 @@ void Wassembler::EnableSubstitutionsLogging()
printSubstitutions = true;
}
void Wassembler::EnableTokensLogging()
{
printTokens = true;
}
bool Wassembler::LoadFromFile(std::string const & filePath)
{
std::vector<std::string> lines;
@@ -122,6 +144,7 @@ bool Wassembler::LoadFromFile(std::string const & filePath)
}
catch(Interpret::InterpretationError & e)
{
std::printf("Semantic error ");
PrintBadToken(e.errorToken, lines);
std::puts("Aborting due to semantic error(s)");
return false;