Refactor tokenization

This commit is contained in:
2020-08-29 14:50:16 +02:00
parent 71678b2ec6
commit 473334c3db
12 changed files with 254 additions and 212 deletions

View File

@@ -8,7 +8,6 @@ namespace Interpret
struct InterpretationError : public std::exception struct InterpretationError : public std::exception
{ {
Token::Token errorToken; Token::Token errorToken;
std::string errorMsg;
InterpretationError(Token::Token const & token, std::string const & msg); InterpretationError(Token::Token const & token, std::string const & msg);
}; };

View File

@@ -46,8 +46,6 @@ namespace Token
static Token CreateMemoryToken(RegisterType const registerType, int const lineNumber, int const lineColumn); static Token CreateMemoryToken(RegisterType const registerType, int const lineNumber, int const lineColumn);
static Token CreateMemoryToken(int const value, int const lineNumber, int const lineColumn); static Token CreateMemoryToken(int const value, int const lineNumber, int const lineColumn);
void DebugPrint() const; void Print() const;
}; };
void PrintTokens(std::vector<Token> const & tokens);
} }

View File

@@ -8,20 +8,16 @@ namespace Token
class Tokenizer class Tokenizer
{ {
private: private:
std::vector<std::pair<std::string, std::string>> substitutions; // argument for string should never be of length zero
Token ExtractToken( Token ExtractToken(
std::string string, std::string const & string,
int const lineNumber, std::size_t const lineNumber,
int const lineColumn) const; std::size_t const lineColumn) const;
void ParseCharacterLiteral(
std::string const & line,
int const lineNumber,
unsigned & lineColumn,
std::vector<Token> & tokens) const;
public: public:
void Tokenize(std::string const & line, int const lineNumber, std::vector<Token> & tokens); void Tokenize(
std::string const & line,
std::size_t const lineNumber,
std::vector<Token> & tokens);
}; };
} }

View File

@@ -1,6 +1,18 @@
#pragma once #pragma once
#include <optional>
#include <string>
namespace Utils namespace Utils
{ {
bool isWhitespaceCharacter(char const c); bool isWhitespaceCharacter(char const c);
// Extracts the value that begins at index pos of src and runs up to and
// including the next occurrence of surroundingCharacter after pos.
// The character at pos itself is not inspected; it is always part of the result.
// Returns nullopt in case the value is missing its terminator character
std::optional<std::string> getValueSurroundedBy(
std::string const & src,
std::size_t const pos,
char const surroundingCharacter);
// Extracts the value that begins at index pos of src and runs up to, but not
// including, the first whitespace character found after pos.
// When no whitespace follows, the remainder of src from pos onwards is returned.
std::string getValueSurroundedByWhitespace(
std::string const & src,
std::size_t const pos);
} }

View File

@@ -10,13 +10,16 @@ private:
Configuration config; Configuration config;
Execute::VirtualMachine vm; Execute::VirtualMachine vm;
bool printSubstitutions; bool printSubstitutions;
bool printTokens;
bool LoadLinesFromFile(std::string const & filePath, std::vector<std::string> & lines) const; bool LoadLinesFromFile(std::string const & filePath, std::vector<std::string> & lines) const;
bool LoadTokens(std::vector<std::string> const & lines, std::vector<Token::Token> & tokens) const; bool LoadTokens(std::vector<std::string> const & lines, std::vector<Token::Token> & tokens) const;
public: public:
void SetMemorySize(unsigned const size); void SetMemorySize(unsigned const size);
void EnableSubstitutionsLogging(); void EnableSubstitutionsLogging();
void EnableTokensLogging();
bool LoadFromFile(std::string const & filePath); bool LoadFromFile(std::string const & filePath);

View File

@@ -13,7 +13,7 @@ BINARY = bin/wassembler
all: ${BINARY} all: ${BINARY}
check: ${BINARY} check: ${BINARY}
./$< ./bin/test.wasm -p ./$< ./bin/test.wasm
clean: clean:
-rm -rf build ./${BINARY} -rm -rf build ./${BINARY}

View File

@@ -3,9 +3,9 @@
namespace Interpret namespace Interpret
{ {
InterpretationError::InterpretationError(Token::Token const & token, std::string const & msg) InterpretationError::InterpretationError(Token::Token const & token, std::string const & msg)
: errorToken(token), : errorToken(token)
errorMsg(msg)
{ {
errorToken.errorMessage = msg;
} }
ExpectedArgument::ExpectedArgument(Token::Token const & token) ExpectedArgument::ExpectedArgument(Token::Token const & token)

View File

@@ -8,11 +8,13 @@ int main(int argc, char ** argv)
std::string inputFile; std::string inputFile;
unsigned memorySize = 1024; unsigned memorySize = 1024;
bool printSubstitutions = false; bool printSubstitutions = false;
bool printTokens = false;
auto cli = ( auto cli = (
clipp::value("input wasm file").set(inputFile), clipp::value("input wasm file").set(inputFile),
clipp::option("-m", "--memory-size") & clipp::value("memory size", memorySize), clipp::option("-m", "--memory-size") & clipp::value("memory size", memorySize),
clipp::option("-p", "--print-substitutions").set(printSubstitutions) clipp::option("-ps", "--print-substitutions").set(printSubstitutions),
clipp::option("-pt", "--print-tokens").set(printTokens)
); );
if (!clipp::parse(argc, argv, cli)) if (!clipp::parse(argc, argv, cli))
@@ -28,6 +30,11 @@ int main(int argc, char ** argv)
wassembler.EnableSubstitutionsLogging(); wassembler.EnableSubstitutionsLogging();
} }
if (printTokens)
{
wassembler.EnableTokensLogging();
}
if (!wassembler.LoadFromFile(inputFile)) if (!wassembler.LoadFromFile(inputFile))
{ {
exit(1); exit(1);

View File

@@ -108,7 +108,12 @@ namespace Token
Token Token::CreateMemoryToken(RegisterType const registerType, int const lineNumber, int const lineColumn) Token Token::CreateMemoryToken(RegisterType const registerType, int const lineNumber, int const lineColumn)
{ {
return Token(TokenType::Memory, registerType, registerType != RegisterType::Unknown, lineNumber, lineColumn); if (registerType == RegisterType::Unknown)
{
return CreateErrorToken("Unknown register used", TokenType::Register, lineNumber, lineColumn);
}
return Token(TokenType::Memory, registerType, true, lineNumber, lineColumn);
} }
Token Token::CreateMemoryToken(int const value, int const lineNumber, int const lineColumn) Token Token::CreateMemoryToken(int const value, int const lineNumber, int const lineColumn)
@@ -116,7 +121,7 @@ namespace Token
return Token(TokenType::Memory, value, true, lineNumber, lineColumn); return Token(TokenType::Memory, value, true, lineNumber, lineColumn);
} }
void Token::DebugPrint() const void Token::Print() const
{ {
std::putc(' ', stdout); std::putc(' ', stdout);
switch(type) switch(type)
@@ -207,28 +212,4 @@ namespace Token
break; break;
} }
} }
void PrintTokens(std::vector<Token> const & tokens)
{
std::puts("*** Tokenization result ***");
unsigned statementNumber = 0u;
std::printf("%02u - ", statementNumber);
for(unsigned i = 0u; i < tokens.size(); ++i)
{
auto const & token = tokens[i];
token.DebugPrint();
if (token.type == TokenType::StatementEnd)
{
++statementNumber;
if (i + 1 < tokens.size())
{
std::printf("\n%02u - ", statementNumber);
}
else
{
std::puts("");
}
}
}
}
} }

View File

@@ -20,25 +20,115 @@ namespace Token
} }
} }
Token Tokenizer::ExtractToken(std::string string, Token GetCharacterLiteralToken(
int const lineNumber, std::string const & token,
int const lineColumn) const std::size_t const lineNumber,
std::size_t const lineColumn)
{ {
if (string.size() == 0) for(std::size_t i = 1; i < token.size(); ++i)
{ {
// TODO Should this become an error token? if (token[i] == '\'')
return Token::CreateEmptyToken(lineNumber, lineColumn);
}
for(std::size_t i = 0; i < substitutions.size(); ++i)
{
if (string == substitutions[i].first)
{ {
string = substitutions[i].second; if (i != 2)
break; {
return Token::CreateErrorToken(
"Character literal must be exactly 1 character long between single quotes",
TokenType::ImmediateInteger,
lineNumber,
lineColumn + 1u);
}
else
{
return Token::CreateImmediateValueToken(
token[1],
lineNumber,
lineColumn + 1);
}
} }
} }
return Token::CreateErrorToken(
"Non terminated character literal",
TokenType::ImmediateInteger,
lineNumber,
lineColumn);
}
// Builds a memory token from a bracketed address.
//
// Accepted forms are an immediate address ("[$1]") or a register address
// ("[%A]"). Anything else yields an error token describing the problem:
//  - a missing opening or closing bracket,
//  - brackets with too little content between them,
//  - an immediate address that is not an integer,
//  - a prefix other than '$' or '%'.
//
// token:      the raw text of the token, expected to start with '[' and/or
//             end with ']'
// lineNumber: line of the token in the source file
// lineColumn: column of the first character of the token
Token GetMemoryToken(
    std::string const & token,
    std::size_t const lineNumber,
    std::size_t const lineColumn)
{
    bool const hasOpeningBracket = !token.empty() && token.front() == '[';
    bool const hasClosingBracket = !token.empty() && token.back() == ']';

    // The brackets are validated before the length, otherwise a short
    // half open token such as "[$1" or "A]" would wrongly be reported as
    // an empty memory address statement.
    if (!hasOpeningBracket || !hasClosingBracket)
    {
        // Point at the bracket that is present: the opening one sits at the
        // token start, a lone closing one sits at the token end.
        std::size_t const errorLineColumn = (hasClosingBracket && !hasOpeningBracket)
            ? (lineColumn + token.size() - 1u)
            : lineColumn;

        return Token::CreateErrorToken(
            "Non terminated memory address brackets",
            TokenType::Memory,
            lineNumber,
            errorLineColumn);
    }

    // Minimal example: [$1] or [%A]
    if (token.size() < 4)
    {
        return Token::CreateErrorToken(
            "Memory address statement is empty",
            TokenType::Memory,
            lineNumber,
            lineColumn);
    }

    char const memoryPrefix = token[1];
    // Everything between the prefix character and the closing bracket
    std::string const valueString = token.substr(2, token.size() - 3u);

    if (memoryPrefix == '$')
    {
        auto const result = TryParseInt(valueString);
        if (result.has_value())
        {
            return Token::CreateMemoryToken(
                result.value(),
                lineNumber,
                lineColumn);
        }

        return Token::CreateErrorToken(
            "Memory immediate address cannot be parsed as an integer",
            TokenType::Memory,
            lineNumber,
            lineColumn);
    }
    else if (memoryPrefix == '%')
    {
        return Token::CreateMemoryToken(
            GetRegisterType(valueString),
            lineNumber,
            lineColumn);
    }

    // The offending prefix character is one column past the opening bracket
    return Token::CreateErrorToken(
        "Memory immediate address contains an unexpected value",
        TokenType::Memory,
        lineNumber,
        lineColumn + 1u);
}
// Builds the error token for a character or string literal that lacks its
// closing quote, located at the given line and column.
Token GetUnterminatedCharacterLiteralError(
    std::size_t const lineNumber,
    std::size_t const lineColumn)
{
    char const * const message = "Unterminated character or string literal";
    return Token::CreateErrorToken(message, TokenType::Unknown, lineNumber, lineColumn);
}
Token Tokenizer::ExtractToken(
std::string const & string,
std::size_t const lineNumber,
std::size_t const lineColumn) const
{
char const prefix = string[0]; char const prefix = string[0];
switch(prefix) switch(prefix)
{ {
@@ -67,78 +157,36 @@ namespace Token
lineNumber, lineNumber,
lineColumn); lineColumn);
case '\'':
return GetCharacterLiteralToken(string, lineNumber, lineColumn);
case ';': case ';':
return Token::CreateStatementEndToken(lineNumber, lineColumn); return Token::CreateStatementEndToken(lineNumber, lineColumn);
case '[':
return GetMemoryToken(string, lineNumber, lineColumn);
default: default:
break; break;
} }
char const postfix = string[string.size() - 1]; char const postfix = string[string.size() - 1];
if (postfix == ':') switch(postfix)
{ {
case ']':
return GetMemoryToken(string, lineNumber, lineColumn);
case ':':
// TODO check if label is an Operand? // TODO check if label is an Operand?
return Token::CreateLabelToken( return Token::CreateLabelToken(
string.substr(0, string.size() - 1), string.substr(0, string.size() - 1),
lineNumber, lineNumber,
lineColumn); lineColumn);
}
if (prefix == '[' && postfix == ']') case '\'':
{ case '\"':
if(string.size() < 4) // This shouldn't happen
{ return GetUnterminatedCharacterLiteralError(lineNumber, lineColumn);
return Token::CreateErrorToken(
"Memory address statement is empty",
TokenType::Memory,
lineNumber,
lineColumn);
}
char const memoryPrefix = string[1];
std::string const valueString = string.substr(2, string.size() - 3u);
if (memoryPrefix == '$')
{
auto const result = TryParseInt(valueString);
if (result.has_value())
{
return Token::CreateMemoryToken(
result.value(),
lineNumber,
lineColumn);
}
return Token::CreateErrorToken(
"Memory immediate address cannot be parsed as an integer",
TokenType::Memory,
lineNumber,
lineColumn);
}
else if (memoryPrefix == '%')
{
return Token::CreateMemoryToken(
GetRegisterType(valueString),
lineNumber,
lineColumn);
}
else
{
return Token::CreateErrorToken(
"Memory immediate address contains an unexpected value",
TokenType::Memory,
lineNumber,
lineColumn + 1u);
}
}
else if (prefix == '[' || postfix == ']')
{
int const errorLineColumn = (prefix == '[') ? lineColumn : (lineColumn + string.size() - 1u);
return Token::CreateErrorToken(
"Non terminated memory address brackets",
TokenType::Memory,
lineNumber,
errorLineColumn);
} }
OperandType const opType = GetOperandType(string); OperandType const opType = GetOperandType(string);
@@ -147,125 +195,69 @@ namespace Token
return Token::CreateOperandToken(opType, lineNumber, lineColumn); return Token::CreateOperandToken(opType, lineNumber, lineColumn);
} }
// Last resort: it must be a label // Last resort: it must be a jump target
return Token::CreateLabelToken(string, lineNumber, lineColumn); return Token::CreateLabelToken(string, lineNumber, lineColumn);
} }
// Scans line for the single quote that closes the literal starting at
// lineColumn and appends exactly one token to tokens: the immediate value on
// success, an error token otherwise.
// lineColumn is updated to the index of the closing quote, or to line.size()
// when no closing quote exists.
void Tokenizer::ParseCharacterLiteral(
    std::string const & line,
    int const lineNumber,
    unsigned & lineColumn,
    std::vector<Token> & tokens) const
{
    // Walk forward from the character after lineColumn to the closing quote
    unsigned closingQuote = lineColumn + 1;
    while (closingQuote < line.size() && line[closingQuote] != '\'')
    {
        ++closingQuote;
    }

    if (closingQuote >= line.size())
    {
        tokens.push_back(Token::CreateErrorToken(
            "Non terminated character literal",
            TokenType::ImmediateInteger,
            lineNumber,
            lineColumn));
        lineColumn = line.size();
        return;
    }

    // A valid literal has exactly one character between the two quotes
    bool const holdsSingleCharacter = (lineColumn + 2u == closingQuote);
    if (holdsSingleCharacter)
    {
        tokens.push_back(Token::CreateImmediateValueToken(
            line[closingQuote - 1],
            lineNumber,
            lineColumn + 1));
    }
    else
    {
        tokens.push_back(Token::CreateErrorToken(
            "Character literal must be exactly 1 character long between single quotes",
            TokenType::ImmediateInteger,
            lineNumber,
            lineColumn + 1u));
    }

    lineColumn = closingQuote;
}
void Tokenizer::Tokenize( void Tokenizer::Tokenize(
std::string const & line, std::string const & line,
int const lineNumber, std::size_t const lineNumber,
std::vector<Token> & tokens) std::vector<Token> & tokens)
{ {
enum class TokenizerState for(std::size_t column = 0u; column < line.size(); ++column)
{ {
LookForNextToken, if (Utils::isWhitespaceCharacter(line[column]))
LookForTokenEnd,
};
TokenizerState state = TokenizerState::LookForNextToken;
unsigned columnTokenStart = 0;
for(unsigned column = 0u; column < line.size(); ++column)
{
switch(state)
{ {
case TokenizerState::LookForNextToken: continue;
if (!Utils::isWhitespaceCharacter(line[column])) }
switch(line[column])
{
case '\'':
case '\"':
{ {
if (line[column] == '\'') auto const result = Utils::getValueSurroundedBy(
line,
column,
line[column]);
if (result.has_value())
{ {
// TODO integrate this better with the existing extract token tokens.push_back(ExtractToken(result.value(), lineNumber, column));
// infrastructure column += result.value().size() - 1;
ParseCharacterLiteral(line, lineNumber, column, tokens);
break;
} }
else
columnTokenStart = column;
switch(line[column])
{ {
case ';':
tokens.push_back( tokens.push_back(
ExtractToken(line.substr(column, 1), lineNumber, column)); GetUnterminatedCharacterLiteralError(lineNumber, column));
break;
default: // Parsing must stop here, the line is malformed
state = TokenizerState::LookForTokenEnd; return;
break;
} }
} }
break; break;
case TokenizerState::LookForTokenEnd: case ';':
if (Utils::isWhitespaceCharacter(line[column]) || line[column] == ';') tokens.push_back(ExtractToken(";", lineNumber, column));
{
tokens.push_back(ExtractToken(line.substr(columnTokenStart, column - columnTokenStart), lineNumber, columnTokenStart));
if (line[column] == ';')
{
tokens.push_back(ExtractToken(line.substr(column, 1), lineNumber, column));
}
state = TokenizerState::LookForNextToken;
}
break; break;
default: default:
std::puts("DEBUG: Unhandled TokenizerState value"); {
auto const result = Utils::getValueSurroundedByWhitespace(line, column);
auto const lastCharacterIndex = result.size() - 1;
if (result[lastCharacterIndex] == ';')
{
tokens.push_back(ExtractToken(result.substr(0, result.size() -1), lineNumber, column));
tokens.push_back(ExtractToken(";", lineNumber, column + lastCharacterIndex));
}
else
{
tokens.push_back(ExtractToken(result, lineNumber, column));
}
column += result.size();
}
break; break;
} }
} }
switch(state)
{
case TokenizerState::LookForTokenEnd:
tokens.push_back(ExtractToken(
line.substr(columnTokenStart, line.size()),
lineNumber,
columnTokenStart));
break;
case TokenizerState::LookForNextToken:
default:
break;
}
} }
} }

View File

@@ -6,4 +6,35 @@ namespace Utils
{ {
return c == '\n' || c == ' ' || c == '\t' || c == '\r'; return c == '\n' || c == ' ' || c == '\t' || c == '\r';
} }
// Returns the part of src that starts at pos and ends with the first
// occurrence of surroundingCharacter found after pos (both ends included).
// Yields nullopt when no such terminating character exists.
std::optional<std::string> getValueSurroundedBy(
    std::string const & src,
    std::size_t const pos,
    char const surroundingCharacter)
{
    // The search starts one past pos: the character at pos is the opener
    auto const terminator = src.find(surroundingCharacter, pos + 1);
    if (terminator == std::string::npos)
    {
        return std::nullopt;
    }

    auto const length = (terminator + 1) - pos;
    return std::make_optional(src.substr(pos, length));
}
// Returns the part of src that starts at pos and stops right before the
// first whitespace character found after pos. Without any whitespace after
// pos the rest of src is returned.
std::string getValueSurroundedByWhitespace(
    std::string const & src,
    std::size_t const pos)
{
    // The character at pos is never tested, it always belongs to the value
    std::size_t end = pos + 1;
    while (end < src.size())
    {
        if (isWhitespaceCharacter(src[end]))
        {
            return src.substr(pos, end - pos);
        }
        ++end;
    }

    return src.substr(pos);
}
} }

View File

@@ -21,7 +21,6 @@ void PrintBadToken(Token::Token const & token, std::vector<std::string> const &
void PrintTokenError(Interpret::InterpretationError const & err, std::vector<std::string> const & lines) void PrintTokenError(Interpret::InterpretationError const & err, std::vector<std::string> const & lines)
{ {
std::printf("%s ", err.errorMsg.c_str());
PrintBadToken(err.errorToken, lines); PrintBadToken(err.errorToken, lines);
} }
@@ -66,6 +65,24 @@ bool Wassembler::LoadTokens(std::vector<std::string> const & lines, std::vector<
} }
} }
if (printTokens && tokens.size() > 0)
{
int previousLine = tokens[0].lineNumber;
std::printf("Line %04i: ", previousLine);
for(auto const & token : tokens)
{
if (token.lineNumber != previousLine)
{
std::putc('\n', stdout);
previousLine = token.lineNumber;
std::printf("Line %04i: ", previousLine);
}
token.Print();
}
std::putc('\n', stdout);
}
// Validate the syntax // Validate the syntax
bool syntaxError = false; bool syntaxError = false;
for(auto const & token : tokens) for(auto const & token : tokens)
@@ -91,6 +108,11 @@ void Wassembler::EnableSubstitutionsLogging()
printSubstitutions = true; printSubstitutions = true;
} }
void Wassembler::EnableTokensLogging()
{
printTokens = true;
}
bool Wassembler::LoadFromFile(std::string const & filePath) bool Wassembler::LoadFromFile(std::string const & filePath)
{ {
std::vector<std::string> lines; std::vector<std::string> lines;
@@ -122,6 +144,7 @@ bool Wassembler::LoadFromFile(std::string const & filePath)
} }
catch(Interpret::InterpretationError & e) catch(Interpret::InterpretationError & e)
{ {
std::printf("Semantic error ");
PrintBadToken(e.errorToken, lines); PrintBadToken(e.errorToken, lines);
std::puts("Aborting due to semantic error(s)"); std::puts("Aborting due to semantic error(s)");
return false; return false;