Refactor tokenization

2020-08-29 14:50:16 +02:00
parent 71678b2ec6
commit 473334c3db
12 changed files with 254 additions and 212 deletions
--- a/include/interpret/errors.hpp
+++ b/include/interpret/errors.hpp
@@ -8,7 +8,6 @@ namespace Interpret
 	// Exception type (derives from std::exception) that carries a token
 	// together with a message.
 	struct InterpretationError : public std::exception
 	{
 		// Copy of the token handed to the constructor
 		Token::Token errorToken;
 		// Copy of the message handed to the constructor
 		std::string errorMsg;
 		// token: the token this error refers to
 		// msg: text describing the error
 		InterpretationError(Token::Token const & token, std::string const & msg);
 	};
--- a/include/token/token.hpp
+++ b/include/token/token.hpp
@@ -46,8 +46,6 @@ namespace Token
 		static Token CreateMemoryToken(RegisterType const registerType, int const lineNumber, int const lineColumn);
 		static Token CreateMemoryToken(int const value, int const lineNumber, int const lineColumn);
-		void DebugPrint() const;
+		void Print() const;
 	};
 	void PrintTokens(std::vector<Token> const & tokens);
 }
--- a/include/token/tokenizer.hpp
+++ b/include/token/tokenizer.hpp
@@ -8,20 +8,16 @@ namespace Token
 	class Tokenizer
 	{
 	private:
-		std::vector<std::pair<std::string, std::string>> substitutions;
+		// argument for string should never be of length zero
 		Token ExtractToken(
-			std::string string,
+			std::string const & string,
-			int const lineNumber,
+			std::size_t const lineNumber,
-			int const lineColumn) const;
+			std::size_t const lineColumn) const;
 		void ParseCharacterLiteral(
 			std::string const & line,
 			int const lineNumber,
 			unsigned & lineColumn,
 			std::vector<Token> & tokens) const;
 	public:
-		void Tokenize(std::string const & line, int const lineNumber, std::vector<Token> & tokens);
+		void Tokenize(
 			std::string const & line,
 			std::size_t const lineNumber,
 			std::vector<Token> & tokens);
 	};
 }
--- a/include/utils.hpp
+++ b/include/utils.hpp
@@ -1,6 +1,18 @@
 #pragma once
 #include <optional>
 #include <string>
 namespace Utils
 {
 	// True when c is a newline, space, tab or carriage return
 	bool isWhitespaceCharacter(char const c);
 	// Extracts the value that starts at src[pos] and runs up to and including
 	// the next occurrence of surroundingCharacter after pos.
 	// Returns nullopt in case the value is missing its terminator character
 	std::optional<std::string> getValueSurroundedBy(
 		std::string const & src,
 		std::size_t const pos,
 		char const surroundingCharacter);
 	// Returns the part of src from pos up to (not including) the first
 	// whitespace character found after pos, or the remainder of src when
 	// there is none. The character at pos itself is not checked.
 	std::string getValueSurroundedByWhitespace(
 		std::string const & src,
 		std::size_t const pos);
 }
--- a/include/wassembler.hpp
+++ b/include/wassembler.hpp
@@ -10,13 +10,16 @@ private:
 	Configuration config;
 	Execute::VirtualMachine vm;
 	bool printSubstitutions;
 	bool printTokens;
 	bool LoadLinesFromFile(std::string const & filePath, std::vector<std::string> & lines) const;
 	bool LoadTokens(std::vector<std::string> const & lines, std::vector<Token::Token> & tokens) const;
 public:
 	void SetMemorySize(unsigned const size);
 	void EnableSubstitutionsLogging();
 	void EnableTokensLogging();
 	bool LoadFromFile(std::string const & filePath);
--- a/2
+++ b/2
@@ -13,7 +13,7 @@ BINARY = bin/wassembler
 all: ${BINARY}
 check: ${BINARY}
-	./$< ./bin/test.wasm -p
+	./$< ./bin/test.wasm
 clean:
 	-rm -rf build ./${BINARY}
--- a/src/interpret/errors.cpp
+++ b/src/interpret/errors.cpp
@@ -3,9 +3,9 @@
 namespace Interpret
 {
 	InterpretationError::InterpretationError(Token::Token const & token, std::string const & msg)
-		: errorToken(token),
+		: errorToken(token)
 		errorMsg(msg)
 	{
 		errorToken.errorMessage = msg;
 	}
 	ExpectedArgument::ExpectedArgument(Token::Token const & token)
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -8,11 +8,13 @@ int main(int argc, char ** argv)
 	std::string inputFile;
 	unsigned memorySize = 1024;
 	bool printSubstitutions = false;
 	bool printTokens = false;
 	auto cli = (
 		clipp::value("input wasm file").set(inputFile),
 		clipp::option("-m", "--memory-size") & clipp::value("memory size", memorySize),
-		clipp::option("-p", "--print-substitutions").set(printSubstitutions)
+		clipp::option("-ps", "--print-substitutions").set(printSubstitutions),
 		clipp::option("-pt", "--print-tokens").set(printTokens)
 	);
 	if (!clipp::parse(argc, argv, cli))
@@ -28,6 +30,11 @@ int main(int argc, char ** argv)
 		wassembler.EnableSubstitutionsLogging();
 	}
 	if (printTokens)
 	{
 		wassembler.EnableTokensLogging();
 	}
 	if (!wassembler.LoadFromFile(inputFile))
 	{
 		exit(1);
--- a/src/token/token.cpp
+++ b/src/token/token.cpp
@@ -108,7 +108,12 @@ namespace Token
 	Token Token::CreateMemoryToken(RegisterType const registerType, int const lineNumber, int const lineColumn)
 	{
-		return Token(TokenType::Memory, registerType, registerType != RegisterType::Unknown, lineNumber, lineColumn);
+		if (registerType == RegisterType::Unknown)
 		{
 			return CreateErrorToken("Unknown register used", TokenType::Register, lineNumber, lineColumn);
 		}
 		return Token(TokenType::Memory, registerType, true, lineNumber, lineColumn);
 	}
 	Token Token::CreateMemoryToken(int const value, int const lineNumber, int const lineColumn)
@@ -116,7 +121,7 @@ namespace Token
 		return Token(TokenType::Memory, value, true, lineNumber, lineColumn);
 	}
-	void Token::DebugPrint() const
+	void Token::Print() const
 	{
 		std::putc(' ', stdout);
 		switch(type)
@@ -207,28 +212,4 @@ namespace Token
 			break;
 		}
 	}
 	// Writes the token stream to stdout under a banner line. Output is split
 	// into numbered statements: a new numbered line starts after every
 	// StatementEnd token, and the final StatementEnd only ends the line.
 	void PrintTokens(std::vector<Token> const & tokens)
 	{
 		std::puts("*** Tokenization result ***");
 		unsigned statementIndex = 0u;
 		std::printf("%02u - ", statementIndex);
 		auto const tokenCount = tokens.size();
 		for(unsigned position = 0u; position < tokenCount; ++position)
 		{
 			auto const & current = tokens[position];
 			// NOTE(review): the header hunk of this change renames
 			// Token::DebugPrint to Token::Print - confirm this call still resolves
 			current.DebugPrint();
 			if (current.type != TokenType::StatementEnd)
 			{
 				continue;
 			}
 			++statementIndex;
 			bool const moreTokensFollow = position + 1 < tokenCount;
 			if (!moreTokensFollow)
 			{
 				// Last statement: only terminate the current output line
 				std::puts("");
 			}
 			else
 			{
 				std::printf("\n%02u - ", statementIndex);
 			}
 		}
 	}
 }
--- a/src/token/tokenizer.cpp
+++ b/src/token/tokenizer.cpp
@@ -20,25 +20,115 @@ namespace Token
 		}
 	}
-	Token Tokenizer::ExtractToken(std::string string,
+	Token GetCharacterLiteralToken(
-	int const lineNumber,
+		std::string const & token,
-	int const lineColumn) const
+		std::size_t const lineNumber,
 		std::size_t const lineColumn)
 	{
-		if (string.size() == 0)
+		for(std::size_t i = 1; i < token.size(); ++i)
 		{
-			// TODO Should this become an error token?
+			if (token[i] == '\'')
 			return Token::CreateEmptyToken(lineNumber, lineColumn);
 		}
 		for(std::size_t i = 0; i < substitutions.size(); ++i)
 		{
 			if (string == substitutions[i].first)
 			{
-				string = substitutions[i].second;
+				if (i != 2)
-				break;
+				{
 					return Token::CreateErrorToken(
 						"Character literal must be exactly 1 character long between single quotes",
 						TokenType::ImmediateInteger,
 						lineNumber,
 						lineColumn + 1u);
 				}
 				else
 				{
 					return Token::CreateImmediateValueToken(
 						token[1],
 						lineNumber,
 						lineColumn + 1);
 				}
 			}
 		}
 		return Token::CreateErrorToken(
 			"Non terminated character literal",
 			TokenType::ImmediateInteger,
 			lineNumber,
 			lineColumn);
 	}
 	// Turns a bracketed memory operand such as [$1] or [%A] into a token.
 	// token: the raw text, expected to start with '[' and end with ']'
 	// lineNumber / lineColumn: position forwarded to the created token
 	// Returns a memory token on success, otherwise an error token of type
 	// Memory describing what is wrong with the operand.
 	Token GetMemoryToken(
 		std::string const & token,
 		std::size_t const lineNumber,
 		std::size_t const lineColumn)
 	{
 		std::size_t const length = token.size();
 		// Shortest accepted form has 4 characters: '[', prefix, one value character, ']'
 		if (length < 4)
 		{
 			return Token::CreateErrorToken(
 				"Memory address statement is empty",
 				TokenType::Memory,
 				lineNumber,
 				lineColumn);
 		}
 		bool const opensWithBracket = token.front() == '[';
 		bool const closesWithBracket = token.back() == ']';
 		if (!(opensWithBracket && closesWithBracket))
 		{
 			return Token::CreateErrorToken(
 				"Non terminated memory address brackets",
 				TokenType::Memory,
 				lineNumber,
 				lineColumn);
 		}
 		// The character after '[' selects the addressing form; the value is
 		// everything between that character and the closing bracket
 		switch(token[1])
 		{
 			case '$':
 			{
 				auto const address = TryParseInt(token.substr(2, length - 3u));
 				if (!address.has_value())
 				{
 					return Token::CreateErrorToken(
 						"Memory immediate address cannot be parsed as an integer",
 						TokenType::Memory,
 						lineNumber,
 						lineColumn);
 				}
 				return Token::CreateMemoryToken(
 					address.value(),
 					lineNumber,
 					lineColumn);
 			}
 			case '%':
 			{
 				return Token::CreateMemoryToken(
 					GetRegisterType(token.substr(2, length - 3u)),
 					lineNumber,
 					lineColumn);
 			}
 			default:
 			{
 				// Unknown prefix: report it at the column of the prefix character
 				return Token::CreateErrorToken(
 					"Memory immediate address contains an unexpected value",
 					TokenType::Memory,
 					lineNumber,
 					lineColumn + 1u);
 			}
 		}
 	}
 	// Creates the error token (type Unknown) used for a quoted literal that
 	// has no closing quote, positioned at the given line and column.
 	Token GetUnterminatedCharacterLiteralError(
 		std::size_t const lineNumber,
 		std::size_t const lineColumn)
 	{
 		char const * const message = "Unterminated character or string literal";
 		return Token::CreateErrorToken(message, TokenType::Unknown, lineNumber, lineColumn);
 	}
 	Token Tokenizer::ExtractToken(
 		std::string const & string,
 		std::size_t const lineNumber,
 		std::size_t const lineColumn) const
 	{
 		char const prefix = string[0];
 		switch(prefix)
 		{
@@ -67,78 +157,36 @@ namespace Token
 				lineNumber,
 				lineColumn);
 			case '\'':
 			return GetCharacterLiteralToken(string, lineNumber, lineColumn);
 			case ';':
 			return Token::CreateStatementEndToken(lineNumber, lineColumn);
 			case '[':
 			return GetMemoryToken(string, lineNumber, lineColumn);
 			default:
 			break;
 		}
 		char const postfix = string[string.size() - 1];
-		if (postfix == ':')
+		switch(postfix)
 		{
 			case ']':
 			return GetMemoryToken(string, lineNumber, lineColumn);
 			case ':':
 			// TODO check if label is an Operand?
 			return Token::CreateLabelToken(
 				string.substr(0, string.size() - 1),
 				lineNumber,
 				lineColumn);
 		}
-		if (prefix == '[' && postfix == ']')
+			case '\'':
-		{
+			case '\"':
-			if(string.size() < 4)
+			// This shouldn't happen
-			{
+			return GetUnterminatedCharacterLiteralError(lineNumber, lineColumn);
 				return Token::CreateErrorToken(
 					"Memory address statement is empty",
 					TokenType::Memory,
 					lineNumber,
 					lineColumn);
 			}
 			char const memoryPrefix = string[1];
 			std::string const valueString = string.substr(2, string.size() - 3u);
 			if (memoryPrefix == '$')
 			{
 				auto const result = TryParseInt(valueString);
 				if (result.has_value())
 				{
 					return Token::CreateMemoryToken(
 						result.value(),
 						lineNumber,
 						lineColumn);
 				}
 				return Token::CreateErrorToken(
 					"Memory immediate address cannot be parsed as an integer",
 					TokenType::Memory,
 					lineNumber,
 					lineColumn);
 			}
 			else if (memoryPrefix == '%')
 			{
 				return Token::CreateMemoryToken(
 					GetRegisterType(valueString),
 					lineNumber,
 					lineColumn);
 			}
 			else
 			{
 				return Token::CreateErrorToken(
 					"Memory immediate address contains an unexpected value",
 					TokenType::Memory,
 					lineNumber,
 					lineColumn + 1u);
 			}
 		}
 		else if (prefix == '[' || postfix == ']')
 		{
 			int const errorLineColumn = (prefix == '[') ? lineColumn : (lineColumn + string.size() - 1u);
 			return Token::CreateErrorToken(
 				"Non terminated memory address brackets",
 				TokenType::Memory,
 				lineNumber,
 				errorLineColumn);
 		}
 		OperandType const opType = GetOperandType(string);
@@ -147,125 +195,69 @@ namespace Token
 			return Token::CreateOperandToken(opType, lineNumber, lineColumn);
 		}
-		// Last resort: it must be a label
+		// Last resort: it must be a jump target
 		return Token::CreateLabelToken(string, lineNumber, lineColumn);
 	}
 	// Handles a single-quoted character literal whose opening quote is at
 	// line[lineColumn] and appends exactly one token to tokens:
 	//  - an immediate value token when exactly one character sits between the quotes
 	//  - an error token when the quotes enclose zero or several characters
 	//  - an error token when no closing quote exists on the line
 	// On return lineColumn is the index of the closing quote, or line.size()
 	// when the literal is not terminated.
 	void Tokenizer::ParseCharacterLiteral(
 		std::string const & line,
 		int const lineNumber,
 		unsigned & lineColumn,
 		std::vector<Token> & tokens) const
 	{
 		// Scan forward for the closing quote
 		unsigned closingQuote = lineColumn + 1;
 		while (closingQuote < line.size() && line[closingQuote] != '\'')
 		{
 			++closingQuote;
 		}
 		if (!(closingQuote < line.size()))
 		{
 			tokens.push_back(Token::CreateErrorToken(
 				"Non terminated character literal",
 				TokenType::ImmediateInteger,
 				lineNumber,
 				lineColumn));
 			lineColumn = line.size();
 			return;
 		}
 		bool const holdsSingleCharacter = (closingQuote == lineColumn + 2u);
 		if (holdsSingleCharacter)
 		{
 			tokens.push_back(Token::CreateImmediateValueToken(
 				line[closingQuote - 1],
 				lineNumber,
 				lineColumn + 1));
 		}
 		else
 		{
 			tokens.push_back(Token::CreateErrorToken(
 				"Character literal must be exactly 1 character long between single quotes",
 				TokenType::ImmediateInteger,
 				lineNumber,
 				lineColumn + 1u));
 		}
 		lineColumn = closingQuote;
 	}
 	void Tokenizer::Tokenize(
 		std::string const & line,
-		int const lineNumber,
+		std::size_t const lineNumber,
 		std::vector<Token> & tokens)
 	{
-		enum class TokenizerState
+		for(std::size_t column = 0u; column < line.size(); ++column)
 		{
-			LookForNextToken,
+			if (Utils::isWhitespaceCharacter(line[column]))
 			LookForTokenEnd,
 		};
 		TokenizerState state = TokenizerState::LookForNextToken;
 		unsigned columnTokenStart = 0;
 		for(unsigned column = 0u; column < line.size(); ++column)
 		{
 			switch(state)
 			{
-				case TokenizerState::LookForNextToken:
+				continue;
-				if (!Utils::isWhitespaceCharacter(line[column]))
+			}
 			switch(line[column])
 			{
 				case '\'':
 				case '\"':
 				{
-					if (line[column] == '\'')
+					auto const result = Utils::getValueSurroundedBy(
 						line,
 						column,
 						line[column]);
 					if (result.has_value())
 					{
-						// TODO integrate this better with the existing extract token
+						tokens.push_back(ExtractToken(result.value(), lineNumber, column));
-						// infrastructure
+						column += result.value().size() - 1;
 						ParseCharacterLiteral(line, lineNumber, column, tokens);
 						break;
 					}
-
+					else
 					columnTokenStart = column;
 					switch(line[column])
 					{
 						case ';':
 						tokens.push_back(
-							ExtractToken(line.substr(column, 1), lineNumber, column));
+							GetUnterminatedCharacterLiteralError(lineNumber, column));
 						break;
-						default:
+						// Parsing must stop here, the line is malformed
-						state = TokenizerState::LookForTokenEnd;
+						return;
 						break;
 					}
 				}
 				break;
-				case TokenizerState::LookForTokenEnd:
+				case ';':
-				if (Utils::isWhitespaceCharacter(line[column]) || line[column] == ';')
+				tokens.push_back(ExtractToken(";", lineNumber, column));
 				{
 					tokens.push_back(ExtractToken(line.substr(columnTokenStart, column - columnTokenStart), lineNumber, columnTokenStart));
 					if (line[column] == ';')
 					{
 						tokens.push_back(ExtractToken(line.substr(column, 1), lineNumber, column));
 					}
 					state = TokenizerState::LookForNextToken;
 				}
 				break;
 				default:
-				std::puts("DEBUG: Unhandled TokenizerState value");
+				{
 					auto const result = Utils::getValueSurroundedByWhitespace(line, column);
 					auto const lastCharacterIndex = result.size() - 1;
 					if (result[lastCharacterIndex] == ';')
 					{
 						tokens.push_back(ExtractToken(result.substr(0, result.size() -1), lineNumber, column));
 						tokens.push_back(ExtractToken(";", lineNumber, column + lastCharacterIndex));
 					}
 					else
 					{
 						tokens.push_back(ExtractToken(result, lineNumber, column));
 					}
 					column += result.size();
 				}
 				break;
 			}
 		}
 		switch(state)
 		{
 			case TokenizerState::LookForTokenEnd:
 			tokens.push_back(ExtractToken(
 				line.substr(columnTokenStart, line.size()),
 				lineNumber,
 				columnTokenStart));
 			break;
 			case TokenizerState::LookForNextToken:
 			default:
 			break;
 		}
 	}
 }
--- a/src/utils.cpp
+++ b/src/utils.cpp
@@ -6,4 +6,35 @@ namespace Utils
 	{
 		return c == '\n' || c == ' ' || c == '\t' || c == '\r';
 	}
 	// Extracts a delimited value from src.
 	// src: text to read from
 	// pos: index of the opening delimiter; the search for the closing one
 	//      starts at the character after it
 	// surroundingCharacter: the delimiter that terminates the value
 	// Returns the text from pos up to and including the terminator, or
 	// nullopt when no terminator follows pos.
 	std::optional<std::string> getValueSurroundedBy(
 		std::string const & src,
 		std::size_t const pos,
 		char const surroundingCharacter)
 	{
 		std::size_t const terminator = src.find(surroundingCharacter, pos + 1);
 		if (terminator == std::string::npos)
 		{
 			return std::nullopt;
 		}
 		// Length covers both delimiters
 		std::size_t const length = (terminator + 1) - pos;
 		return std::make_optional(src.substr(pos, length));
 	}
 	// Returns the run of characters that starts at src[pos] and stops right
 	// before the first whitespace character found after pos. When no
 	// whitespace follows, the remainder of src from pos is returned.
 	// The character at pos itself is never tested for whitespace.
 	std::string getValueSurroundedByWhitespace(
 		std::string const & src,
 		std::size_t const pos)
 	{
 		std::size_t end = pos + 1;
 		while (end < src.size() && !isWhitespaceCharacter(src[end]))
 		{
 			++end;
 		}
 		if (end < src.size())
 		{
 			// Stopped on a whitespace character: cut just before it
 			return src.substr(pos, end - pos);
 		}
 		return src.substr(pos);
 	}
 }
--- a/src/wassembler.cpp
+++ b/src/wassembler.cpp
@@ -21,7 +21,6 @@ void PrintBadToken(Token::Token const & token, std::vector<std::string> const &
 // Prints the error's message followed by a single space, then passes the
 // error's token and the source lines on to PrintBadToken.
 void PrintTokenError(Interpret::InterpretationError const & err, std::vector<std::string> const & lines)
 {
 	char const * const message = err.errorMsg.c_str();
 	std::printf("%s ", message);
 	PrintBadToken(err.errorToken, lines);
 }
@@ -66,6 +65,24 @@ bool Wassembler::LoadTokens(std::vector<std::string> const & lines, std::vector<
 		}
 	}
 	if (printTokens && tokens.size() > 0)
 	{
 		int previousLine = tokens[0].lineNumber;
 		std::printf("Line %04i: ", previousLine);
 		for(auto const & token : tokens)
 		{
 			if (token.lineNumber != previousLine)
 			{
 				std::putc('\n', stdout);
 				previousLine = token.lineNumber;
 				std::printf("Line %04i: ", previousLine);
 			}
 			token.Print();
 		}
 		std::putc('\n', stdout);
 	}
 	// Validate the syntax
 	bool syntaxError = false;
 	for(auto const & token : tokens)
@@ -91,6 +108,11 @@ void Wassembler::EnableSubstitutionsLogging()
 	printSubstitutions = true;
 }
 void Wassembler::EnableTokensLogging()
 {
 	printTokens = true;
 }
 bool Wassembler::LoadFromFile(std::string const & filePath)
 {
 	std::vector<std::string> lines;
@@ -122,6 +144,7 @@ bool Wassembler::LoadFromFile(std::string const & filePath)
 	}
 	catch(Interpret::InterpretationError & e)
 	{
 		std::printf("Semantic error ");
 		PrintBadToken(e.errorToken, lines);
 		std::puts("Aborting due to semantic error(s)");
 		return false;