Refactor tokenization

2020-08-29 14:50:16 +02:00
parent 71678b2ec6
commit 473334c3db
12 changed files with 254 additions and 212 deletions
--- a/include/interpret/errors.hpp
+++ b/include/interpret/errors.hpp
@@ -8,7 +8,6 @@ namespace Interpret
 	struct InterpretationError : public std::exception
 	{
 		Token::Token errorToken;
-		std::string errorMsg;
 		InterpretationError(Token::Token const & token, std::string const & msg);
 	};

--- a/include/token/token.hpp
+++ b/include/token/token.hpp
@@ -46,8 +46,6 @@ namespace Token
 		static Token CreateMemoryToken(RegisterType const registerType, int const lineNumber, int const lineColumn);
 		static Token CreateMemoryToken(int const value, int const lineNumber, int const lineColumn);

-		void DebugPrint() const;
+		void Print() const;
 	};
-
-	void PrintTokens(std::vector<Token> const & tokens);
 }
--- a/include/token/tokenizer.hpp
+++ b/include/token/tokenizer.hpp
@@ -8,20 +8,16 @@ namespace Token
 	class Tokenizer
 	{
 	private:
-		std::vector<std::pair<std::string, std::string>> substitutions;
-
+		// Precondition: string must not be empty (its first and last characters are read without a length check)
 		Token ExtractToken(
-			std::string string,
-			int const lineNumber,
-			int const lineColumn) const;
-
-		void ParseCharacterLiteral(
-			std::string const & line,
-			int const lineNumber,
-			unsigned & lineColumn,
-			std::vector<Token> & tokens) const;
+			std::string const & string,
+			std::size_t const lineNumber,
+			std::size_t const lineColumn) const;

 	public:
-		void Tokenize(std::string const & line, int const lineNumber, std::vector<Token> & tokens);
+		void Tokenize(
+			std::string const & line,
+			std::size_t const lineNumber,
+			std::vector<Token> & tokens);
 	};
 }
--- a/include/utils.hpp
+++ b/include/utils.hpp
@@ -1,6 +1,18 @@
 #pragma once
+#include <optional>
+#include <string>

 namespace Utils
 {
 	bool isWhitespaceCharacter(char const c);
+
+	// Returns the part of src from pos through the next surroundingCharacter found after pos (both ends included); returns nullopt if that terminator character is missing
+	std::optional<std::string> getValueSurroundedBy(
+		std::string const & src,
+		std::size_t const pos,
+		char const surroundingCharacter);
+
+	std::string getValueSurroundedByWhitespace(
+		std::string const & src,
+		std::size_t const pos);
 }
--- a/include/wassembler.hpp
+++ b/include/wassembler.hpp
@@ -10,13 +10,16 @@ private:
 	Configuration config;
 	Execute::VirtualMachine vm;
 	bool printSubstitutions;
+	bool printTokens;

 	bool LoadLinesFromFile(std::string const & filePath, std::vector<std::string> & lines) const;
 	bool LoadTokens(std::vector<std::string> const & lines, std::vector<Token::Token> & tokens) const;

 public:
 	void SetMemorySize(unsigned const size);
+
 	void EnableSubstitutionsLogging();
+	void EnableTokensLogging();

 	bool LoadFromFile(std::string const & filePath);

--- a/2
+++ b/2
@@ -13,7 +13,7 @@ BINARY = bin/wassembler
 all: ${BINARY}

 check: ${BINARY}
-	./$< ./bin/test.wasm -p
+	./$< ./bin/test.wasm

 clean:
 	-rm -rf build ./${BINARY}
--- a/src/interpret/errors.cpp
+++ b/src/interpret/errors.cpp
@@ -3,9 +3,9 @@
 namespace Interpret
 {
 	InterpretationError::InterpretationError(Token::Token const & token, std::string const & msg)
-		: errorToken(token),
-		errorMsg(msg)
+		: errorToken(token)
 	{
+		errorToken.errorMessage = msg;
 	}

 	ExpectedArgument::ExpectedArgument(Token::Token const & token)
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -8,11 +8,13 @@ int main(int argc, char ** argv)
 	std::string inputFile;
 	unsigned memorySize = 1024;
 	bool printSubstitutions = false;
+	bool printTokens = false;

 	auto cli = (
 		clipp::value("input wasm file").set(inputFile),
 		clipp::option("-m", "--memory-size") & clipp::value("memory size", memorySize),
-		clipp::option("-p", "--print-substitutions").set(printSubstitutions)
+		clipp::option("-ps", "--print-substitutions").set(printSubstitutions),
+		clipp::option("-pt", "--print-tokens").set(printTokens)
 	);

 	if (!clipp::parse(argc, argv, cli))
@@ -28,6 +30,11 @@ int main(int argc, char ** argv)
 		wassembler.EnableSubstitutionsLogging();
 	}

+	if (printTokens)
+	{
+		wassembler.EnableTokensLogging();
+	}
+
 	if (!wassembler.LoadFromFile(inputFile))
 	{
 		exit(1);
--- a/src/token/token.cpp
+++ b/src/token/token.cpp
@@ -108,7 +108,12 @@ namespace Token

 	Token Token::CreateMemoryToken(RegisterType const registerType, int const lineNumber, int const lineColumn)
 	{
-		return Token(TokenType::Memory, registerType, registerType != RegisterType::Unknown, lineNumber, lineColumn);
+		if (registerType == RegisterType::Unknown)
+		{
+			return CreateErrorToken("Unknown register used", TokenType::Register, lineNumber, lineColumn);
+		}
+
+		return Token(TokenType::Memory, registerType, true, lineNumber, lineColumn);
 	}

 	Token Token::CreateMemoryToken(int const value, int const lineNumber, int const lineColumn)
@@ -116,7 +121,7 @@ namespace Token
 		return Token(TokenType::Memory, value, true, lineNumber, lineColumn);
 	}

-	void Token::DebugPrint() const
+	void Token::Print() const
 	{
 		std::putc(' ', stdout);
 		switch(type)
@@ -207,28 +212,4 @@ namespace Token
 			break;
 		}
 	}
-
-	void PrintTokens(std::vector<Token> const & tokens)
-	{
-		std::puts("*** Tokenization result ***");
-		unsigned statementNumber = 0u;
-		std::printf("%02u - ", statementNumber);
-		for(unsigned i = 0u; i < tokens.size(); ++i)
-		{
-			auto const & token = tokens[i];
-			token.DebugPrint();
-			if (token.type == TokenType::StatementEnd)
-			{
-				++statementNumber;
-				if (i + 1 < tokens.size())
-				{
-					std::printf("\n%02u - ", statementNumber);
-				}
-				else
-				{
-					std::puts("");
-				}
-			}
-		}
-	}
 }
--- a/src/token/tokenizer.cpp
+++ b/src/token/tokenizer.cpp
@@ -20,25 +20,115 @@ namespace Token
 		}
 	}

-	Token Tokenizer::ExtractToken(std::string string,
-	int const lineNumber,
-	int const lineColumn) const
+	Token GetCharacterLiteralToken(
+		std::string const & token,
+		std::size_t const lineNumber,
+		std::size_t const lineColumn)
 	{
-		if (string.size() == 0)
+		for(std::size_t i = 1; i < token.size(); ++i)
 		{
-			// TODO Should this become an error token?
-			return Token::CreateEmptyToken(lineNumber, lineColumn);
+			if (token[i] == '\'')
+			{
+				if (i != 2)
+				{
+					return Token::CreateErrorToken(
+						"Character literal must be exactly 1 character long between single quotes",
+						TokenType::ImmediateInteger,
+						lineNumber,
+						lineColumn + 1u);
 				}
-
-		for(std::size_t i = 0; i < substitutions.size(); ++i)
+				else
 				{
-			if (string == substitutions[i].first)
-			{
-				string = substitutions[i].second;
-				break;
+					return Token::CreateImmediateValueToken(
+						token[1],
+						lineNumber,
+						lineColumn + 1);
+				}
 			}
 		}

+		return Token::CreateErrorToken(
+			"Non terminated character literal",
+			TokenType::ImmediateInteger,
+			lineNumber,
+			lineColumn);
+	}
+
+	Token GetMemoryToken(
+		std::string const & token,
+		std::size_t const lineNumber,
+		std::size_t const lineColumn)
+	{
+		// Minimal example: [$1] or [%A]
+		if(token.size() < 4)
+		{
+			return Token::CreateErrorToken(
+				"Memory address statement is empty",
+				TokenType::Memory,
+				lineNumber,
+				lineColumn);
+		}
+
+		if (token[0] != '[' || token[token.size() - 1] != ']')
+		{
+			return Token::CreateErrorToken(
+				"Non terminated memory address brackets",
+				TokenType::Memory,
+				lineNumber,
+				lineColumn);
+		}
+
+		char const memoryPrefix = token[1];
+		std::string const valueString = token.substr(2, token.size() - 3u);
+		if (memoryPrefix == '$')
+		{
+			auto const result = TryParseInt(valueString);
+
+			if (result.has_value())
+			{
+				return Token::CreateMemoryToken(
+					result.value(),
+					lineNumber,
+					lineColumn);
+			}
+
+			return Token::CreateErrorToken(
+				"Memory immediate address cannot be parsed as an integer",
+				TokenType::Memory,
+				lineNumber,
+				lineColumn);
+		}
+		else if (memoryPrefix == '%')
+		{
+			return Token::CreateMemoryToken(
+				GetRegisterType(valueString),
+				lineNumber,
+				lineColumn);
+		}
+
+		return Token::CreateErrorToken(
+			"Memory immediate address contains an unexpected value",
+			TokenType::Memory,
+			lineNumber,
+			lineColumn + 1u);
+	}
+
+	Token GetUnterminatedCharacterLiteralError(
+		std::size_t const lineNumber,
+		std::size_t const lineColumn)
+	{
+		return Token::CreateErrorToken(
+			"Unterminated character or string literal",
+			TokenType::Unknown,
+			lineNumber,
+			lineColumn);
+	}
+
+	Token Tokenizer::ExtractToken(
+		std::string const & string,
+		std::size_t const lineNumber,
+		std::size_t const lineColumn) const
+	{
 		char const prefix = string[0];
 		switch(prefix)
 		{
@@ -67,78 +157,36 @@ namespace Token
 				lineNumber,
 				lineColumn);

+			case '\'':
+			return GetCharacterLiteralToken(string, lineNumber, lineColumn);
+
 			case ';':
 			return Token::CreateStatementEndToken(lineNumber, lineColumn);

+			case '[':
+			return GetMemoryToken(string, lineNumber, lineColumn);
+
 			default:
 			break;
 		}

 		char const postfix = string[string.size() - 1];
-		if (postfix == ':')
+		switch(postfix)
 		{
+			case ']':
+			return GetMemoryToken(string, lineNumber, lineColumn);
+
+			case ':':
 			// TODO check if label is an Operand?
 			return Token::CreateLabelToken(
 				string.substr(0, string.size() - 1),
 				lineNumber,
 				lineColumn);
-		}

-		if (prefix == '[' && postfix == ']')
-		{
-			if(string.size() < 4)
-			{
-				return Token::CreateErrorToken(
-					"Memory address statement is empty",
-					TokenType::Memory,
-					lineNumber,
-					lineColumn);
-			}
-
-			char const memoryPrefix = string[1];
-			std::string const valueString = string.substr(2, string.size() - 3u);
-			if (memoryPrefix == '$')
-			{
-				auto const result = TryParseInt(valueString);
-
-				if (result.has_value())
-				{
-					return Token::CreateMemoryToken(
-						result.value(),
-						lineNumber,
-						lineColumn);
-				}
-
-				return Token::CreateErrorToken(
-					"Memory immediate address cannot be parsed as an integer",
-					TokenType::Memory,
-					lineNumber,
-					lineColumn);
-			}
-			else if (memoryPrefix == '%')
-			{
-				return Token::CreateMemoryToken(
-					GetRegisterType(valueString),
-					lineNumber,
-					lineColumn);
-			}
-			else
-			{
-				return Token::CreateErrorToken(
-					"Memory immediate address contains an unexpected value",
-					TokenType::Memory,
-					lineNumber,
-					lineColumn + 1u);
-			}
-		}
-		else if (prefix == '[' || postfix == ']')
-		{
-			int const errorLineColumn = (prefix == '[') ? lineColumn : (lineColumn + string.size() - 1u);
-			return Token::CreateErrorToken(
-				"Non terminated memory address brackets",
-				TokenType::Memory,
-				lineNumber,
-				errorLineColumn);
+			case '\'':
+			case '\"':
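+			// Only reached when the prefix switch above did not return; NOTE(review): a terminated "..." value appears to land here too unless '"' is handled as a prefix case outside this hunk - confirm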
+			return GetUnterminatedCharacterLiteralError(lineNumber, lineColumn);
 		}

 		OperandType const opType = GetOperandType(string);
@@ -147,125 +195,69 @@ namespace Token
 			return Token::CreateOperandToken(opType, lineNumber, lineColumn);
 		}

-		// Last resort: it must be a label
+		// Last resort: it must be a jump target
 		return Token::CreateLabelToken(string, lineNumber, lineColumn);
 	}

-	// Modifies the lineColumn parameter to point at the character literal end
-	void Tokenizer::ParseCharacterLiteral(
-		std::string const & line,
-		int const lineNumber,
-		unsigned & lineColumn,
-		std::vector<Token> & tokens) const
-	{
-		for(unsigned int i = lineColumn + 1; i < line.size(); ++i)
-		{
-			if (line[i] == '\'')
-			{
-				if (lineColumn + 2u != i)
-				{
-					tokens.push_back(Token::CreateErrorToken(
-						"Character literal must be exactly 1 character long between single quotes",
-						TokenType::ImmediateInteger,
-						lineNumber,
-						lineColumn + 1u));
-				}
-				else
-				{
-					tokens.push_back(Token::CreateImmediateValueToken(
-						line[i - 1],
-						lineNumber,
-						lineColumn + 1));
-				}
-
-				lineColumn = i;
-				return;
-			}
-		}
-
-		tokens.push_back(Token::CreateErrorToken(
-			"Non terminated character literal",
-			TokenType::ImmediateInteger,
-			lineNumber,
-			lineColumn));
-
-		lineColumn = line.size();
-	}
-
 	void Tokenizer::Tokenize(
 		std::string const & line,
-		int const lineNumber,
+		std::size_t const lineNumber,
 		std::vector<Token> & tokens)
 	{
-		enum class TokenizerState
+		for(std::size_t column = 0u; column < line.size(); ++column)
 		{
-			LookForNextToken,
-			LookForTokenEnd,
-		};
-
-		TokenizerState state = TokenizerState::LookForNextToken;
-		unsigned columnTokenStart = 0;
-		for(unsigned column = 0u; column < line.size(); ++column)
+			if (Utils::isWhitespaceCharacter(line[column]))
 			{
-			switch(state)
-			{
-				case TokenizerState::LookForNextToken:
-				if (!Utils::isWhitespaceCharacter(line[column]))
-				{
-					if (line[column] == '\'')
-					{
-						// TODO integrate this better with the existing extract token
-						// infrastructure
-						ParseCharacterLiteral(line, lineNumber, column, tokens);
-						break;
+				continue;
 			}

-					columnTokenStart = column;
-
 			switch(line[column])
 			{
-						case ';':
+				case '\'':
+				case '\"':
+				{
+					auto const result = Utils::getValueSurroundedBy(
+						line,
+						column,
+						line[column]);
+					if (result.has_value())
+					{
+						tokens.push_back(ExtractToken(result.value(), lineNumber, column));
+						column += result.value().size() - 1;
+					}
+					else
+					{
 						tokens.push_back(
-							ExtractToken(line.substr(column, 1), lineNumber, column));
+							GetUnterminatedCharacterLiteralError(lineNumber, column));
+
+						// Parsing must stop here, the line is malformed
+						return;
+					}
+				}
+				break;
+
+				case ';':
+				tokens.push_back(ExtractToken(";", lineNumber, column));
 				break;

 				default:
-						state = TokenizerState::LookForTokenEnd;
-						break;
-					}
-				}
-				break;
-
-				case TokenizerState::LookForTokenEnd:
-				if (Utils::isWhitespaceCharacter(line[column]) || line[column] == ';')
 				{
-					tokens.push_back(ExtractToken(line.substr(columnTokenStart, column - columnTokenStart), lineNumber, columnTokenStart));
-					if (line[column] == ';')
+					auto const result = Utils::getValueSurroundedByWhitespace(line, column);
+					auto const lastCharacterIndex = result.size() - 1;
+					if (result[lastCharacterIndex] == ';')
 					{
-						tokens.push_back(ExtractToken(line.substr(column, 1), lineNumber, column));
+						tokens.push_back(ExtractToken(result.substr(0, result.size() -1), lineNumber, column));
+						tokens.push_back(ExtractToken(";", lineNumber, column + lastCharacterIndex));
 					}
-					state = TokenizerState::LookForNextToken;
-				}
-				break;
-
-				default:
-				std::puts("DEBUG: Unhandled TokenizerState value");
-				break;
-			}
-		}
-
-		switch(state)
+					else
 					{
-			case TokenizerState::LookForTokenEnd:
-			tokens.push_back(ExtractToken(
-				line.substr(columnTokenStart, line.size()),
-				lineNumber,
-				columnTokenStart));
-			break;
+						tokens.push_back(ExtractToken(result, lineNumber, column));
+					}

-			case TokenizerState::LookForNextToken:
-			default:
+					column += result.size();
+				}
 				break;
 			}
 		}
+	}
 }
--- a/src/utils.cpp
+++ b/src/utils.cpp
@@ -6,4 +6,35 @@ namespace Utils
 	{
 		return c == '\n' || c == ' ' || c == '\t' || c == '\r';
 	}
+
+	std::optional<std::string> getValueSurroundedBy(
+		std::string const & src,
+		std::size_t const pos,
+		char const surroundingCharacter)
+	{
+		for(std::size_t i = pos + 1; i < src.size(); ++i)
+		{
+			if (src[i] == surroundingCharacter)
+			{
+				return std::make_optional(src.substr(pos, (i + 1) - pos));
+			}
+		}
+
+		return std::nullopt;
+	}
+
+	std::string getValueSurroundedByWhitespace(
+		std::string const & src,
+		std::size_t const pos)
+	{
+		for(std::size_t i = pos + 1; i < src.size(); ++i)
+		{
+			if (isWhitespaceCharacter(src[i]))
+			{
+				return src.substr(pos, i - pos);
+			}
+		}
+
+		return src.substr(pos);
+	}
 }
--- a/src/wassembler.cpp
+++ b/src/wassembler.cpp
@@ -21,7 +21,6 @@ void PrintBadToken(Token::Token const & token, std::vector<std::string> const &

 void PrintTokenError(Interpret::InterpretationError const & err, std::vector<std::string> const & lines)
 {
-	std::printf("%s ", err.errorMsg.c_str());
 	PrintBadToken(err.errorToken, lines);
 }

@@ -66,6 +65,24 @@ bool Wassembler::LoadTokens(std::vector<std::string> const & lines, std::vector<
 		}
 	}

+	if (printTokens && tokens.size() > 0)
+	{
+		int previousLine = tokens[0].lineNumber;
+		std::printf("Line %04i: ", previousLine);
+		for(auto const & token : tokens)
+		{
+			if (token.lineNumber != previousLine)
+			{
+				std::putc('\n', stdout);
+				previousLine = token.lineNumber;
+				std::printf("Line %04i: ", previousLine);
+			}
+
+			token.Print();
+		}
+		std::putc('\n', stdout);
+	}
+
 	// Validate the syntax
 	bool syntaxError = false;
 	for(auto const & token : tokens)
@@ -91,6 +108,11 @@ void Wassembler::EnableSubstitutionsLogging()
 	printSubstitutions = true;
 }

+void Wassembler::EnableTokensLogging()
+{
+	printTokens = true;
+}
+
 bool Wassembler::LoadFromFile(std::string const & filePath)
 {
 	std::vector<std::string> lines;
@@ -122,6 +144,7 @@ bool Wassembler::LoadFromFile(std::string const & filePath)
 	}
 	catch(Interpret::InterpretationError & e)
 	{
+		std::printf("Semantic error ");
 		PrintBadToken(e.errorToken, lines);
 		std::puts("Aborting due to semantic error(s)");
 		return false;