From 473334c3dba3c7ff9a21368ded7e6300f705eda5 Mon Sep 17 00:00:00 2001 From: Tijmen van Nesselrooij Date: Sat, 29 Aug 2020 14:50:16 +0200 Subject: [PATCH] Refactor tokenization --- include/interpret/errors.hpp | 1 - include/token/token.hpp | 4 +- include/token/tokenizer.hpp | 20 +-- include/utils.hpp | 12 ++ include/wassembler.hpp | 3 + makefile | 2 +- src/interpret/errors.cpp | 4 +- src/main.cpp | 9 +- src/token/token.cpp | 33 +--- src/token/tokenizer.cpp | 322 +++++++++++++++++------------------ src/utils.cpp | 31 ++++ src/wassembler.cpp | 25 ++- 12 files changed, 254 insertions(+), 212 deletions(-) diff --git a/include/interpret/errors.hpp b/include/interpret/errors.hpp index 066beea..f73efa6 100644 --- a/include/interpret/errors.hpp +++ b/include/interpret/errors.hpp @@ -8,7 +8,6 @@ namespace Interpret struct InterpretationError : public std::exception { Token::Token errorToken; - std::string errorMsg; InterpretationError(Token::Token const & token, std::string const & msg); }; diff --git a/include/token/token.hpp b/include/token/token.hpp index de60498..15bf2c6 100644 --- a/include/token/token.hpp +++ b/include/token/token.hpp @@ -46,8 +46,6 @@ namespace Token static Token CreateMemoryToken(RegisterType const registerType, int const lineNumber, int const lineColumn); static Token CreateMemoryToken(int const value, int const lineNumber, int const lineColumn); - void DebugPrint() const; + void Print() const; }; - - void PrintTokens(std::vector const & tokens); } diff --git a/include/token/tokenizer.hpp b/include/token/tokenizer.hpp index 1091d68..c2cfb4f 100644 --- a/include/token/tokenizer.hpp +++ b/include/token/tokenizer.hpp @@ -8,20 +8,16 @@ namespace Token class Tokenizer { private: - std::vector> substitutions; - + // argument for string should never be of length zero Token ExtractToken( - std::string string, - int const lineNumber, - int const lineColumn) const; - - void ParseCharacterLiteral( - std::string const & line, - int const lineNumber, - unsigned & lineColumn, - std::vector & tokens) const; + std::string const & string, + std::size_t const lineNumber, + std::size_t const lineColumn) const; public: - void Tokenize(std::string const & line, int const lineNumber, std::vector & tokens); + void Tokenize( + std::string const & line, + std::size_t const lineNumber, + std::vector & tokens); }; } \ No newline at end of file diff --git a/include/utils.hpp b/include/utils.hpp index ea585aa..181a91d 100644 --- a/include/utils.hpp +++ b/include/utils.hpp @@ -1,6 +1,18 @@ #pragma once +#include +#include namespace Utils { bool isWhitespaceCharacter(char const c); + + // Returns nullopt in case the value is missing its terminator character + std::optional getValueSurroundedBy( + std::string const & src, + std::size_t const pos, + char const surroundingCharacter); + + std::string getValueSurroundedByWhitespace( + std::string const & src, + std::size_t const pos); } \ No newline at end of file diff --git a/include/wassembler.hpp b/include/wassembler.hpp index dd9e5fe..998b27e 100644 --- a/include/wassembler.hpp +++ b/include/wassembler.hpp @@ -10,13 +10,16 @@ private: Configuration config; Execute::VirtualMachine vm; bool printSubstitutions; + bool printTokens; bool LoadLinesFromFile(std::string const & filePath, std::vector & lines) const; bool LoadTokens(std::vector const & lines, std::vector & tokens) const; public: void SetMemorySize(unsigned const size); + void EnableSubstitutionsLogging(); + void EnableTokensLogging(); bool LoadFromFile(std::string const & filePath); diff --git a/makefile b/makefile index 220736a..40d93ba 100644 --- a/makefile +++ b/makefile @@ -13,7 +13,7 @@ BINARY = bin/wassembler all: ${BINARY} check: ${BINARY} - ./$< ./bin/test.wasm -p + ./$< ./bin/test.wasm clean: -rm -rf build ./${BINARY} diff --git a/src/interpret/errors.cpp b/src/interpret/errors.cpp index 9d0f36e..1c07a0f 100644 --- a/src/interpret/errors.cpp +++ b/src/interpret/errors.cpp @@ -3,9 +3,9 @@ namespace Interpret { InterpretationError::InterpretationError(Token::Token const & token, std::string const & msg) - : errorToken(token), - errorMsg(msg) + : errorToken(token) { + errorToken.errorMessage = msg; } ExpectedArgument::ExpectedArgument(Token::Token const & token) diff --git a/src/main.cpp b/src/main.cpp index 03c9119..5d25070 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -8,11 +8,13 @@ int main(int argc, char ** argv) std::string inputFile; unsigned memorySize = 1024; bool printSubstitutions = false; + bool printTokens = false; auto cli = ( clipp::value("input wasm file").set(inputFile), clipp::option("-m", "--memory-size") & clipp::value("memory size", memorySize), - clipp::option("-p", "--print-substitutions").set(printSubstitutions) + clipp::option("-ps", "--print-substitutions").set(printSubstitutions), + clipp::option("-pt", "--print-tokens").set(printTokens) ); if (!clipp::parse(argc, argv, cli)) @@ -28,6 +30,11 @@ int main(int argc, char ** argv) wassembler.EnableSubstitutionsLogging(); } + if (printTokens) + { + wassembler.EnableTokensLogging(); + } + if (!wassembler.LoadFromFile(inputFile)) { exit(1); diff --git a/src/token/token.cpp b/src/token/token.cpp index d56e14a..cacc5c6 100644 --- a/src/token/token.cpp +++ b/src/token/token.cpp @@ -108,7 +108,12 @@ namespace Token Token Token::CreateMemoryToken(RegisterType const registerType, int const lineNumber, int const lineColumn) { - return Token(TokenType::Memory, registerType, registerType != RegisterType::Unknown, lineNumber, lineColumn); + if (registerType == RegisterType::Unknown) + { + return CreateErrorToken("Unknown register used", TokenType::Register, lineNumber, lineColumn); + } + + return Token(TokenType::Memory, registerType, true, lineNumber, lineColumn); } Token Token::CreateMemoryToken(int const value, int const lineNumber, int const lineColumn) @@ -116,7 +121,7 @@ namespace Token return Token(TokenType::Memory, value, true, lineNumber, lineColumn); } - void Token::DebugPrint() const + void Token::Print() const { std::putc(' ', stdout); switch(type) @@ -207,28 +212,4 @@ namespace Token break; } } - - void PrintTokens(std::vector const & tokens) - { - std::puts("*** Tokenization result ***"); - unsigned statementNumber = 0u; - std::printf("%02u - ", statementNumber); - for(unsigned i = 0u; i < tokens.size(); ++i) - { - auto const & token = tokens[i]; - token.DebugPrint(); - if (token.type == TokenType::StatementEnd) - { - ++statementNumber; - if (i + 1 < tokens.size()) - { - std::printf("\n%02u - ", statementNumber); - } - else - { - std::puts(""); - } - } - } - } } \ No newline at end of file diff --git a/src/token/tokenizer.cpp b/src/token/tokenizer.cpp index a6244ee..476538f 100644 --- a/src/token/tokenizer.cpp +++ b/src/token/tokenizer.cpp @@ -20,25 +20,115 @@ namespace Token } } - Token Tokenizer::ExtractToken(std::string string, - int const lineNumber, - int const lineColumn) const + Token GetCharacterLiteralToken( + std::string const & token, + std::size_t const lineNumber, + std::size_t const lineColumn) { - if (string.size() == 0) + for(std::size_t i = 1; i < token.size(); ++i) { - // TODO Should this become an error token? - return Token::CreateEmptyToken(lineNumber, lineColumn); - } - - for(std::size_t i = 0; i < substitutions.size(); ++i) - { - if (string == substitutions[i].first) + if (token[i] == '\'') { - string = substitutions[i].second; - break; + if (i != 2) + { + return Token::CreateErrorToken( + "Character literal must be exactly 1 character long between single quotes", + TokenType::ImmediateInteger, + lineNumber, + lineColumn + 1u); + } + else + { + return Token::CreateImmediateValueToken( + token[1], + lineNumber, + lineColumn + 1); + } } } + return Token::CreateErrorToken( + "Non terminated character literal", + TokenType::ImmediateInteger, + lineNumber, + lineColumn); + } + + Token GetMemoryToken( + std::string const & token, + std::size_t const lineNumber, + std::size_t const lineColumn) + { + // Minimal example: [$1] or [%A] + if(token.size() < 4) + { + return Token::CreateErrorToken( + "Memory address statement is empty", + TokenType::Memory, + lineNumber, + lineColumn); + } + + if (token[0] != '[' || token[token.size() - 1] != ']') + { + return Token::CreateErrorToken( + "Non terminated memory address brackets", + TokenType::Memory, + lineNumber, + lineColumn); + } + + char const memoryPrefix = token[1]; + std::string const valueString = token.substr(2, token.size() - 3u); + if (memoryPrefix == '$') + { + auto const result = TryParseInt(valueString); + + if (result.has_value()) + { + return Token::CreateMemoryToken( + result.value(), + lineNumber, + lineColumn); + } + + return Token::CreateErrorToken( + "Memory immediate address cannot be parsed as an integer", + TokenType::Memory, + lineNumber, + lineColumn); + } + else if (memoryPrefix == '%') + { + return Token::CreateMemoryToken( + GetRegisterType(valueString), + lineNumber, + lineColumn); + } + + return Token::CreateErrorToken( + "Memory immediate address contains an unexpected value", + TokenType::Memory, + lineNumber, + lineColumn + 1u); + } + + Token GetUnterminatedCharacterLiteralError( + std::size_t const lineNumber, + std::size_t const lineColumn) + { + return Token::CreateErrorToken( + "Unterminated character or string literal", + TokenType::Unknown, + lineNumber, + lineColumn); + } + + Token Tokenizer::ExtractToken( + std::string const & string, + std::size_t const lineNumber, + std::size_t const lineColumn) const + { char const prefix = string[0]; switch(prefix) { @@ -67,78 +157,36 @@ namespace Token lineNumber, lineColumn); + case '\'': + return GetCharacterLiteralToken(string, lineNumber, lineColumn); + case ';': return Token::CreateStatementEndToken(lineNumber, lineColumn); + case '[': + return GetMemoryToken(string, lineNumber, lineColumn); + default: break; } char const postfix = string[string.size() - 1]; - if (postfix == ':') + switch(postfix) { + case ']': + return GetMemoryToken(string, lineNumber, lineColumn); + + case ':': // TODO check if label is an Operand? return Token::CreateLabelToken( string.substr(0, string.size() - 1), lineNumber, lineColumn); - } - if (prefix == '[' && postfix == ']') - { - if(string.size() < 4) - { - return Token::CreateErrorToken( - "Memory address statement is empty", - TokenType::Memory, - lineNumber, - lineColumn); - } - - char const memoryPrefix = string[1]; - std::string const valueString = string.substr(2, string.size() - 3u); - if (memoryPrefix == '$') - { - auto const result = TryParseInt(valueString); - - if (result.has_value()) - { - return Token::CreateMemoryToken( - result.value(), - lineNumber, - lineColumn); - } - - return Token::CreateErrorToken( - "Memory immediate address cannot be parsed as an integer", - TokenType::Memory, - lineNumber, - lineColumn); - } - else if (memoryPrefix == '%') - { - return Token::CreateMemoryToken( - GetRegisterType(valueString), - lineNumber, - lineColumn); - } - else - { - return Token::CreateErrorToken( - "Memory immediate address contains an unexpected value", - TokenType::Memory, - lineNumber, - lineColumn + 1u); - } - } - else if (prefix == '[' || postfix == ']') - { - int const errorLineColumn = (prefix == '[') ? lineColumn : (lineColumn + string.size() - 1u); - return Token::CreateErrorToken( - "Non terminated memory address brackets", - TokenType::Memory, - lineNumber, - errorLineColumn); + case '\'': + case '\"': + // This shouldn't happen + return GetUnterminatedCharacterLiteralError(lineNumber, lineColumn); } OperandType const opType = GetOperandType(string); @@ -147,125 +195,69 @@ namespace Token return Token::CreateOperandToken(opType, lineNumber, lineColumn); } - // Last resort: it must be a label + // Last resort: it must be a jump target return Token::CreateLabelToken(string, lineNumber, lineColumn); } - // Modifies the lineColumn parameter to point at the character literal end - void Tokenizer::ParseCharacterLiteral( - std::string const & line, - int const lineNumber, - unsigned & lineColumn, - std::vector & tokens) const - { - for(unsigned int i = lineColumn + 1; i < line.size(); ++i) - { - if (line[i] == '\'') - { - if (lineColumn + 2u != i) - { - tokens.push_back(Token::CreateErrorToken( - "Character literal must be exactly 1 character long between single quotes", - TokenType::ImmediateInteger, - lineNumber, - lineColumn + 1u)); - } - else - { - tokens.push_back(Token::CreateImmediateValueToken( - line[i - 1], - lineNumber, - lineColumn + 1)); - } - - lineColumn = i; - return; - } - } - - tokens.push_back(Token::CreateErrorToken( - "Non terminated character literal", - TokenType::ImmediateInteger, - lineNumber, - lineColumn)); - - lineColumn = line.size(); - } - void Tokenizer::Tokenize( std::string const & line, - int const lineNumber, + std::size_t const lineNumber, std::vector & tokens) { - enum class TokenizerState + for(std::size_t column = 0u; column < line.size(); ++column) { - LookForNextToken, - LookForTokenEnd, - }; - - TokenizerState state = TokenizerState::LookForNextToken; - unsigned columnTokenStart = 0; - for(unsigned column = 0u; column < line.size(); ++column) - { - switch(state) + if (Utils::isWhitespaceCharacter(line[column])) { - case TokenizerState::LookForNextToken: - if (!Utils::isWhitespaceCharacter(line[column])) + continue; + } + + switch(line[column]) + { + case '\'': + case '\"': { - if (line[column] == '\'') + auto const result = Utils::getValueSurroundedBy( + line, + column, + line[column]); + if (result.has_value()) { - // TODO integrate this better with the existing extract token - // infrastructure - ParseCharacterLiteral(line, lineNumber, column, tokens); - break; + tokens.push_back(ExtractToken(result.value(), lineNumber, column)); + column += result.value().size() - 1; } - - columnTokenStart = column; - - switch(line[column]) + else { - case ';': tokens.push_back( - ExtractToken(line.substr(column, 1), lineNumber, column)); - break; + GetUnterminatedCharacterLiteralError(lineNumber, column)); - default: - state = TokenizerState::LookForTokenEnd; - break; + // Parsing must stop here, the line is malformed + return; } } break; - case TokenizerState::LookForTokenEnd: - if (Utils::isWhitespaceCharacter(line[column]) || line[column] == ';') - { - tokens.push_back(ExtractToken(line.substr(columnTokenStart, column - columnTokenStart), lineNumber, columnTokenStart)); - if (line[column] == ';') - { - tokens.push_back(ExtractToken(line.substr(column, 1), lineNumber, column)); - } - state = TokenizerState::LookForNextToken; - } + case ';': + tokens.push_back(ExtractToken(";", lineNumber, column)); break; default: - std::puts("DEBUG: Unhandled TokenizerState value"); + { + auto const result = Utils::getValueSurroundedByWhitespace(line, column); + auto const lastCharacterIndex = result.size() - 1; + if (result[lastCharacterIndex] == ';') + { + tokens.push_back(ExtractToken(result.substr(0, result.size() -1), lineNumber, column)); + tokens.push_back(ExtractToken(";", lineNumber, column + lastCharacterIndex)); + } + else + { + tokens.push_back(ExtractToken(result, lineNumber, column)); + } + + column += result.size(); + } break; } } - - switch(state) - { - case TokenizerState::LookForTokenEnd: - tokens.push_back(ExtractToken( - line.substr(columnTokenStart, line.size()), - lineNumber, - columnTokenStart)); - break; - - case TokenizerState::LookForNextToken: - default: - break; - } } } \ No newline at end of file diff --git a/src/utils.cpp b/src/utils.cpp index b929180..0984bfc 100644 --- a/src/utils.cpp +++ b/src/utils.cpp @@ -6,4 +6,35 @@ namespace Utils { return c == '\n' || c == ' ' || c == '\t' || c == '\r'; } + + std::optional getValueSurroundedBy( + std::string const & src, + std::size_t const pos, + char const surroundingCharacter) + { + for(std::size_t i = pos + 1; i < src.size(); ++i) + { + if (src[i] == surroundingCharacter) + { + return std::make_optional(src.substr(pos, (i + 1) - pos)); + } + } + + return std::nullopt; + } + + std::string getValueSurroundedByWhitespace( + std::string const & src, + std::size_t const pos) + { + for(std::size_t i = pos + 1; i < src.size(); ++i) + { + if (isWhitespaceCharacter(src[i])) + { + return src.substr(pos, i - pos); + } + } + + return src.substr(pos); + } } \ No newline at end of file diff --git a/src/wassembler.cpp b/src/wassembler.cpp index 8d8ea8a..349cac4 100644 --- a/src/wassembler.cpp +++ b/src/wassembler.cpp @@ -21,7 +21,6 @@ void PrintBadToken(Token::Token const & token, std::vector const & void PrintTokenError(Interpret::InterpretationError const & err, std::vector const & lines) { - std::printf("%s ", err.errorMsg.c_str()); PrintBadToken(err.errorToken, lines); } @@ -66,6 +65,24 @@ bool Wassembler::LoadTokens(std::vector const & lines, std::vector< } } + if (printTokens && tokens.size() > 0) + { + int previousLine = tokens[0].lineNumber; + std::printf("Line %04i: ", previousLine); + for(auto const & token : tokens) + { + if (token.lineNumber != previousLine) + { + std::putc('\n', stdout); + previousLine = token.lineNumber; + std::printf("Line %04i: ", previousLine); + } + + token.Print(); + } + std::putc('\n', stdout); + } + // Validate the syntax bool syntaxError = false; for(auto const & token : tokens) @@ -91,6 +108,11 @@ void Wassembler::EnableSubstitutionsLogging() printSubstitutions = true; } +void Wassembler::EnableTokensLogging() +{ + printTokens = true; +} + bool Wassembler::LoadFromFile(std::string const & filePath) { std::vector lines; @@ -122,6 +144,7 @@ bool Wassembler::LoadFromFile(std::string const & filePath) } catch(Interpret::InterpretationError & e) { + std::printf("Semantic error "); PrintBadToken(e.errorToken, lines); std::puts("Aborting due to semantic error(s)"); return false;