Extract preprocessing from tokenizer

2020-08-29 11:25:10 +02:00
parent aebc1dd86d
commit 71678b2ec6
12 changed files with 254 additions and 134 deletions
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -7,10 +7,12 @@ int main(int argc, char ** argv)
 {
 	std::string inputFile;
 	unsigned memorySize = 1024;
+	bool printSubstitutions = false;

 	auto cli = (
 		clipp::value("input wasm file").set(inputFile),
-		clipp::option("-m", "--memory-size") & clipp::value("memory size", memorySize)
+		clipp::option("-m", "--memory-size") & clipp::value("memory size", memorySize),
+		clipp::option("-p", "--print-substitutions").set(printSubstitutions)
 	);

 	if (!clipp::parse(argc, argv, cli))
@@ -21,6 +23,11 @@ int main(int argc, char ** argv)

 	Wassembler wassembler;
 	wassembler.SetMemorySize(memorySize);
+	if (printSubstitutions)
+	{
+		wassembler.EnableSubstitutionsLogging();
+	}
+
 	if (!wassembler.LoadFromFile(inputFile))
 	{
 		exit(1);
--- a/src/preprocessor/preprocessor.cpp
+++ b/src/preprocessor/preprocessor.cpp
@@ -0,0 +1,159 @@
+#include <preprocessor/preprocessor.hpp>
+#include <utils.hpp>
+
+// Replaces the first known substitution identifier that starts exactly at
+// lineColumn in line with its associated value.
+//
+// substitutionIdentifiers[i] is paired with substitutionValues[i]. Identifiers
+// are tried in order and only the first match is applied.
+//
+// Returns true when line was modified, false when nothing matched at
+// lineColumn (line is left untouched in that case).
+bool trySubstitute(
+	std::string & line,
+	std::size_t const lineColumn,
+	std::vector<std::string> const & substitutionIdentifiers,
+	std::vector<std::string> const & substitutionValues)
+{
+	// std::string::compare throws std::out_of_range for a start position past
+	// the end of the string, and nothing can match at or beyond the end anyway.
+	if (lineColumn >= line.size())
+	{
+		return false;
+	}
+
+	// Only walk indices that exist in both vectors.
+	for(std::size_t i = 0; i < substitutionIdentifiers.size() && i < substitutionValues.size(); ++i)
+	{
+		std::string const & identifier = substitutionIdentifiers[i];
+
+		// An empty identifier compares equal at every column and would insert
+		// its value without consuming any input, so it never counts as a match.
+		if (identifier.empty() ||
+			line.compare(lineColumn, identifier.size(), identifier) != 0)
+		{
+			continue;
+		}
+
+		// In-place replacement; no copy of the whole line is needed.
+		line.replace(lineColumn, identifier.size(), substitutionValues[i]);
+		return true;
+	}
+
+	return false;
+}
+
+// Handles a comment whose '#' marker sits at lineColumn.
+//
+// A comment of the form "#DEFINE <identifier> [<value>]" registers a
+// substitution: the identifier is appended to substitutionIdentifiers and the
+// value (empty when omitted) to substitutionValues. Arguments after the second
+// one are ignored. Any other comment, including a DEFINE without arguments,
+// registers nothing.
+//
+// In every case the comment is removed from line, leaving only the text that
+// precedes lineColumn.
+//
+// lineNumber is not used by this function at the moment.
+void Preprocessor::extractComment(
+	std::string & line,
+	std::size_t const lineNumber,
+	std::size_t const lineColumn)
+{
+	static_cast<void>(lineNumber);
+
+	std::string const directive = "DEFINE";
+	std::size_t const directiveStart = lineColumn + 1;
+	std::size_t const directiveEnd = directiveStart + directive.size();
+
+	// The directive has to follow the '#' directly, match exactly and be
+	// terminated by whitespace. Comparing the whole remainder of the line
+	// lexicographically would also accept ordinary comments such as "#todo"
+	// and directives such as "#DEFINEX".
+	bool const isDefine =
+		line.size() > directiveEnd &&
+		line.compare(directiveStart, directive.size(), directive) == 0 &&
+		Utils::isWhitespaceCharacter(line[directiveEnd]);
+
+	if (isDefine)
+	{
+		std::size_t cursor = directiveEnd;
+
+		// Skips leading whitespace and returns the following run of
+		// non-whitespace characters (empty when the line is exhausted).
+		auto const readArgument = [&line, &cursor]() -> std::string
+		{
+			while (cursor < line.size() && Utils::isWhitespaceCharacter(line[cursor]))
+			{
+				++cursor;
+			}
+
+			std::size_t const argumentStart = cursor;
+			while (cursor < line.size() && !Utils::isWhitespaceCharacter(line[cursor]))
+			{
+				++cursor;
+			}
+
+			return line.substr(argumentStart, cursor - argumentStart);
+		};
+
+		std::string const identifier = readArgument();
+		std::string const value = readArgument();
+
+		// A DEFINE without any argument is silently dropped.
+		if (!identifier.empty())
+		{
+			substitutionIdentifiers.push_back(identifier);
+			substitutionValues.push_back(value);
+		}
+	}
+
+	// Strip the comment so later stages only see the text before the '#'.
+	if (lineColumn < line.size())
+	{
+		line.erase(lineColumn);
+	}
+}
+
+// Applies the known substitutions to line and strips its comment, if any.
+//
+// The line is scanned left to right. At every non-whitespace column the first
+// substitution identifier that matches there is replaced by its value, and
+// scanning resumes after the inserted value. Resuming inside the value would
+// substitute it again whenever it contains an identifier, which never
+// terminates for a definition such as "DEFINE a ba".
+//
+// The first '#' that is not consumed by a substitution starts a comment: it is
+// handed to extractComment, which may register a new substitution and removes
+// the comment from line. The scan ends there.
+//
+// NOTE(review): identifiers are matched at any column, so also in the middle
+// of a longer word, and a '#' inside a character literal is treated as a
+// comment start - confirm whether either is intended.
+void Preprocessor::processLine(std::string & line, std::size_t const lineNumber)
+{
+	std::size_t column = 0;
+	while (column < line.size())
+	{
+		if (Utils::isWhitespaceCharacter(line[column]))
+		{
+			++column;
+			continue;
+		}
+
+		bool substituted = false;
+		for(std::size_t i = 0; i < substitutionIdentifiers.size() && i < substitutionValues.size(); ++i)
+		{
+			std::string const & identifier = substitutionIdentifiers[i];
+
+			// Empty identifiers would match everywhere without consuming input.
+			if (identifier.empty() ||
+				line.compare(column, identifier.size(), identifier) != 0)
+			{
+				continue;
+			}
+
+			std::string const & value = substitutionValues[i];
+			line.replace(column, identifier.size(), value);
+
+			// Continue behind the inserted text. For an empty value the column
+			// stays put, but the line got shorter, so the scan still ends.
+			column += value.size();
+			substituted = true;
+			break;
+		}
+
+		if (substituted)
+		{
+			continue;
+		}
+
+		if (line[column] == '#')
+		{
+			extractComment(line, lineNumber, column);
+			return;
+		}
+
+		++column;
+	}
+}
+
+// Preprocesses every entry of lines in order, modifying the entries in place.
+//
+// Substitutions recorded by an earlier call are discarded first. Each entry is
+// passed to processLine together with its zero-based index as line number.
+void Preprocessor::process(std::vector<std::string> & lines)
+{
+	substitutionValues.clear();
+	substitutionIdentifiers.clear();
+
+	std::size_t lineNumber = 0;
+	for(std::string & currentLine : lines)
+	{
+		processLine(currentLine, lineNumber);
+		++lineNumber;
+	}
+}
+
+// Writes every recorded substitution to standard output, one per line, in the
+// form "<identifier> -> <value>" and in the order they were recorded.
+void Preprocessor::printSubstitutions() const
+{
+	std::size_t const count = substitutionIdentifiers.size();
+	std::size_t index = 0;
+	while (index < count)
+	{
+		char const * const identifier = substitutionIdentifiers[index].c_str();
+		char const * const value = substitutionValues[index].c_str();
+		std::printf("%s -> %s\n", identifier, value);
+		++index;
+	}
+}
--- a/src/token/tokenizer.cpp
+++ b/src/token/tokenizer.cpp
@@ -3,6 +3,7 @@
 #include <stdexcept>
 #include <token/errors.hpp>
 #include <token/tokenizer.hpp>
+#include <utils.hpp>

 namespace Token
 {
@@ -150,111 +151,6 @@ namespace Token
 		return Token::CreateLabelToken(string, lineNumber, lineColumn);
 	}

-	bool IsWhiteSpace(char const c)
-	{
-		return c == '\n' || c == ' ' || c == '\t' || c == '\r';
-	}
-
-	void Tokenizer::ParseComment(
-		std::string const & string,
-		int const lineNumber,
-		int const lineColumn)
-	{
-		unsigned const commentContentStart = lineColumn + 1;
-		if (string.size() < commentContentStart ||
-			IsWhiteSpace(string[commentContentStart]))
-		{
-			return;
-		}
-
-		enum class CommentParseState
-		{
-			LookForDirectiveEnd,
-			LookForArgumentStart,
-			LookForArgumentEnd
-		};
-		std::string firstArgument, secondArgument;
-		unsigned argumentCount = 0, argumentStart = 0;
-		CommentParseState state = CommentParseState::LookForDirectiveEnd;
-		for(unsigned i = commentContentStart + 1; i < string.size(); ++i)
-		{
-			switch(state)
-			{
-				case CommentParseState::LookForDirectiveEnd:
-				if(IsWhiteSpace(string[i]))
-				{
-					if (string.compare(commentContentStart, i - commentContentStart, "DEFINE"))
-					{
-						// Nonzero = not equal
-						return;
-					}
-
-					state = CommentParseState::LookForArgumentStart;
-				}
-				break;
-
-				case CommentParseState::LookForArgumentStart:
-				if(!IsWhiteSpace(string[i]))
-				{
-					argumentStart = i;
-					state = CommentParseState::LookForArgumentEnd;
-				}
-				break;
-
-				case CommentParseState::LookForArgumentEnd:
-				if (IsWhiteSpace(string[i]))
-				{
-					state = CommentParseState::LookForArgumentStart;
-					switch(argumentCount)
-					{
-						case 0:
-						firstArgument = string.substr(argumentStart, i - argumentStart);
-						break;
-
-						case 1:
-						secondArgument = string.substr(argumentStart, i - argumentStart);
-						break;
-
-						default:
-						goto end_state_loop;
-					}
-					++argumentCount;
-				}
-				break;
-			}
-		}
-end_state_loop:
-		switch(state)
-		{
-			case CommentParseState::LookForDirectiveEnd:
-			case CommentParseState::LookForArgumentStart:
-			break;
-
-			case CommentParseState::LookForArgumentEnd:
-			switch(argumentCount)
-			{
-				case 0:
-				firstArgument = string.substr(argumentStart);
-				break;
-
-				case 1:
-				secondArgument = string.substr(argumentStart);
-				break;
-			}
-			++argumentCount;
-			break;
-		}
-
-		if (argumentCount > 0)
-		{
-			substitutions.push_back(std::make_pair(firstArgument, secondArgument));
-		}
-		else
-		{
-			std::printf("WARNING: DEFINE with no arguments on line %u\n", lineNumber + 1);
-		}
-	}
-
 	// Modifies the lineColumn parameter to point at the character literal end
 	void Tokenizer::ParseCharacterLiteral(
 		std::string const & line,
@@ -314,16 +210,12 @@ end_state_loop:
 			switch(state)
 			{
 				case TokenizerState::LookForNextToken:
-				if (!IsWhiteSpace(line[column]))
+				if (!Utils::isWhitespaceCharacter(line[column]))
 				{
-					if (line[column] == '#')
-					{
-						ParseComment(line, lineNumber, column);
-						return;
-					}
-
 					if (line[column] == '\'')
 					{
+						// TODO integrate this better with the existing extract token
+						// infrastructure
 						ParseCharacterLiteral(line, lineNumber, column, tokens);
 						break;
 					}
@@ -345,7 +237,7 @@ end_state_loop:
 				break;

 				case TokenizerState::LookForTokenEnd:
-				if (IsWhiteSpace(line[column]) || line[column] == ';')
+				if (Utils::isWhitespaceCharacter(line[column]) || line[column] == ';')
 				{
 					tokens.push_back(ExtractToken(line.substr(columnTokenStart, column - columnTokenStart), lineNumber, columnTokenStart));
 					if (line[column] == ';')
--- a/src/utils.cpp
+++ b/src/utils.cpp
@@ -0,0 +1,9 @@
+#include <utils.hpp>
+
+namespace Utils
+{
+	// Returns true when c is a space, a horizontal tab, a line feed or a
+	// carriage return; false for every other character.
+	bool isWhitespaceCharacter(char const c)
+	{
+		switch(c)
+		{
+			case ' ':
+			case '\t':
+			case '\n':
+			case '\r':
+			return true;
+
+			default:
+			return false;
+		}
+	}
+}
--- a/src/wassembler.cpp
+++ b/src/wassembler.cpp
@@ -1,5 +1,6 @@
 #include <fstream>
 #include <interpret/errors.hpp>
+#include <preprocessor/preprocessor.hpp>
 #include <token/errors.hpp>
 #include <wassembler.hpp>

@@ -48,7 +49,6 @@ bool Wassembler::LoadLinesFromFile(std::string const & filePath, std::vector<std
 	return true;
 }

-
 bool Wassembler::LoadTokens(std::vector<std::string> const & lines, std::vector<Token::Token> & tokens) const
 {
 	Token::Tokenizer tokenizer;
@@ -86,6 +86,11 @@ void Wassembler::SetMemorySize(unsigned const size)
 	config.memorySize = size;
 }

+// Sets the printSubstitutions flag. LoadFromFile checks this flag after
+// preprocessing and, when it is set, prints the preprocessor's substitutions.
+void Wassembler::EnableSubstitutionsLogging()
+{
+	this->printSubstitutions = true;
+}
+
 bool Wassembler::LoadFromFile(std::string const & filePath)
 {
 	std::vector<std::string> lines;
@@ -95,6 +100,13 @@ bool Wassembler::LoadFromFile(std::string const & filePath)
 		return false;
 	}

+	Preprocessor preprocessor;
+	preprocessor.process(lines);
+	if (printSubstitutions)
+	{
+		preprocessor.printSubstitutions();
+	}
+
 	std::vector<Token::Token> tokens;
 	if (!LoadTokens(lines, tokens))
 	{