Extract preprocessing from tokenizer

2020-08-29 11:25:10 +02:00
parent aebc1dd86d
commit 71678b2ec6
12 changed files with 254 additions and 134 deletions
--- a/README.md
+++ b/README.md
@@ -24,8 +24,7 @@ afterwards can be a bit cryptic as to where it originated.
 - `[operation][number type]`, e.g. `divi` for divide (div) integer
 - `%[register]` for addressing registers
 - `$[value]` for using immediate (literal) integer values
- `'a'` for using immediate character values (currently only supports non
+- `'a'` for using immediate character values
 escaped characters)
 - `;` for end of statement (mandatory)
 - `[label]:` for labels
 - `#[text]` for comments: any text is ignored till a newline (`\n`) is found
@@ -92,13 +91,33 @@ There is currently no strict checking, so be careful.
 ### Preprocessor
 All preprocessor directives are prefixed by a `#`. Ill-formed preprocessor
-directives do not halt compilation, they are merely reported and then ignored.
+directives do not halt compilation, they are merely ignored. All preprocessing
 is done in a single pass. Recursion, or the definition of a directive by
 another directive, is therefore not supported.
- `DEFINE` replaces any occurrence of the first argument by the second argument.
+- `DEFINE <x> [y]` replaces any occurrence of the first argument (`x`) by the
-	The second argument may be empty, effectively deleting occurences of argument
+	second optional argument (`y`). The second argument can be empty, effectively
-	one. Quotes are currently not supported and arguments are separated by
+	deleting all occurrences of `x`. Quotes are currently not supported and
-	whitespace. If multiple defines exist for the same substitution the first
+	arguments are separated by whitespace. If multiple defines exist the later
-	declared is used.
+	declarations will overwrite the previous.
 ### Registers
 All registers are 32 bits wide. The following 4 registers currently exist:
 - A
 - B
 - C
 - D
 ### Immediates
 An immediate integer value for 42 is, for example, `$42`. Negative values are
 allowed, for example `$-42`. Notation must be decimal; hexadecimal and octal
 notations are **not supported**.
 The immediate character value for the letter g is `'g'`. Character values must
 be a single character, escaped or multi byte characters are **not supported**.
 ### Operands
--- a/bin/test.wasm
+++ b/bin/test.wasm
@@ -1,4 +1,4 @@
-#DEFINE
+#DEFINE LETTER_O 'o'
 addi $10 $-5 %A;
 subi %A $2 %B;
@@ -31,7 +31,7 @@ int PRINT_CHAR;
 seti %A $108; # l
 int PRINT_CHAR;
 int PRINT_CHAR;
-seti %A $111; # o
+seti %A LETTER_O;
 int PRINT_CHAR;
 seti %A $32; # space
@@ -105,9 +105,6 @@ seti %B $10;
 int $5;
 int $3;
 # Demonstrate string literals
 "Hello world!";
 exit;
 noop_function:
--- a/include/preprocessor/preprocessor.hpp
+++ b/include/preprocessor/preprocessor.hpp
@@ -0,0 +1,21 @@
 #pragma once
 #include <string>
 #include <vector>
 // Preprocessor working directly on the source lines: it records the
 // substitutions declared with `#DEFINE`, applies the recorded substitutions
 // to the text and removes everything from a `#` up to the end of the line.
 class Preprocessor
 {
 private:
 	// Parallel lists: substitutionValues[i] is the replacement text that
 	// belongs to substitutionIdentifiers[i].
 	std::vector<std::string> substitutionIdentifiers;
 	std::vector<std::string> substitutionValues;
 	// Handles the `#` located at lineColumn: stores a substitution when a
 	// DEFINE with at least one argument is parsed and cuts the line off at
 	// lineColumn.
 	void extractComment(std::string & line,
 		std::size_t const lineNumber,
 		std::size_t const lineColumn);
 	// Scans one line from left to right, applying substitutions in place and
 	// handing the first `#` that is reached to extractComment.
 	void processLine(std::string & line, std::size_t const lineNumber);
 public:
 	// Clears the previously recorded substitutions and then processes all
 	// lines in order; the lines are modified in place.
 	void process(std::vector<std::string> & lines);
 	// Prints every recorded substitution as "identifier -> value", one per
 	// line, to standard output.
 	void printSubstitutions() const;
 };
--- a/include/token/tokenizer.hpp
+++ b/include/token/tokenizer.hpp
@@ -15,11 +15,6 @@ namespace Token
 			int const lineNumber,
 			int const lineColumn) const;
 		// Parses the comment whose `#` is located at lineColumn and, when it
 		// is a DEFINE directive with arguments, records the substitution.
 		void ParseComment(
 			std::string const & string,
 			int const lineNumber,
 			int const lineColumn);
 		void ParseCharacterLiteral(
 			std::string const & line,
 			int const lineNumber,
--- a/include/utils.hpp
+++ b/include/utils.hpp
@@ -0,0 +1,6 @@
 #pragma once
 // Small helpers shared between the preprocessor and the tokenizer.
 namespace Utils
 {
 	// Returns true when c is a newline, space, tab or carriage return.
 	bool isWhitespaceCharacter(char const c);
 }
--- a/include/wassembler.hpp
+++ b/include/wassembler.hpp
@@ -9,12 +9,15 @@ class Wassembler
 private:
 	Configuration config;
 	Execute::VirtualMachine vm;
 	// Whether the recorded substitutions are printed after preprocessing.
 	// Initialized here because it is only ever assigned by
 	// EnableSubstitutionsLogging(); without a default it would be read while
 	// still indeterminate whenever logging was not enabled.
 	bool printSubstitutions = false;
 	bool LoadLinesFromFile(std::string const & filePath, std::vector<std::string> & lines) const;
 	bool LoadTokens(std::vector<std::string> const & lines, std::vector<Token::Token> & tokens) const;
 public:
 	void SetMemorySize(unsigned const size);
 	void EnableSubstitutionsLogging();
 	bool LoadFromFile(std::string const & filePath);
 	void Run();
--- a/2
+++ b/2
@@ -13,7 +13,7 @@ BINARY = bin/wassembler
 all: ${BINARY}
 check: ${BINARY}
-	./$< ./bin/test.wasm
+	./$< ./bin/test.wasm -p
 clean:
 	-rm -rf build ./${BINARY}
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -7,10 +7,12 @@ int main(int argc, char ** argv)
 {
 	std::string inputFile;
 	unsigned memorySize = 1024;
 	bool printSubstitutions = false;
 	auto cli = (
 		clipp::value("input wasm file").set(inputFile),
-		clipp::option("-m", "--memory-size") & clipp::value("memory size", memorySize)
+		clipp::option("-m", "--memory-size") & clipp::value("memory size", memorySize),
 		clipp::option("-p", "--print-substitutions").set(printSubstitutions)
 	);
 	if (!clipp::parse(argc, argv, cli))
@@ -21,6 +23,11 @@ int main(int argc, char ** argv)
 	Wassembler wassembler;
 	wassembler.SetMemorySize(memorySize);
 	if (printSubstitutions)
 	{
 		wassembler.EnableSubstitutionsLogging();
 	}
 	if (!wassembler.LoadFromFile(inputFile))
 	{
 		exit(1);
--- a/src/preprocessor/preprocessor.cpp
+++ b/src/preprocessor/preprocessor.cpp
@@ -0,0 +1,159 @@
 #include <preprocessor/preprocessor.hpp>
 #include <utils.hpp>
 /**
  * Tries to replace a substitution identifier that starts at `lineColumn`.
  *
  * The identifiers are checked from the most recently declared one to the
  * oldest one, so that a later declaration of the same identifier takes
  * precedence over an earlier one. At most one substitution is applied per
  * call. Empty identifiers are skipped, as they would match at every column.
  *
  * @param line                    Line that is modified in place on a match.
  * @param lineColumn              Index in `line` where the identifier must start.
  * @param substitutionIdentifiers Identifiers to look for.
  * @param substitutionValues      Replacement texts; entry i belongs to identifier i.
  * @return true when a substitution was applied, false when `line` is unchanged.
  */
 bool trySubstitute(
 	std::string & line,
 	std::size_t const lineColumn,
 	std::vector<std::string> const & substitutionIdentifiers,
 	std::vector<std::string> const & substitutionValues)
 {
 	if (lineColumn >= line.size())
 	{
 		// Nothing left to match; this also keeps compare() from throwing
 		// std::out_of_range for a column past the end of the line
 		return false;
 	}

 	// Walk backwards so the latest declaration is found first
 	for(std::size_t i = substitutionIdentifiers.size(); i-- > 0;)
 	{
 		std::string const & identifier = substitutionIdentifiers[i];
 		if (identifier.empty() || i >= substitutionValues.size())
 		{
 			continue;
 		}

 		if (line.compare(lineColumn, identifier.size(), identifier) != 0)
 		{
 			continue;
 		}

 		line.replace(lineColumn, identifier.size(), substitutionValues[i]);
 		return true;
 	}

 	return false;
 }
 /**
  * Handles the `#` found at `lineColumn`.
  *
  * When the `#` is directly followed by the keyword `DEFINE`, whitespace and at
  * least one argument, the first argument is stored as substitution identifier
  * and the second argument (empty when absent) as its value. Arguments are
  * separated by whitespace; any argument after the second one is ignored.
  * Everything else is treated as a plain comment.
  *
  * In all cases the line is cut off at `lineColumn`, so neither the comment
  * nor the directive remains in the line.
  *
  * @param line       Line containing the `#`; truncated in place.
  * @param lineNumber Index of the line; not used by this function.
  * @param lineColumn Index of the `#` within `line`.
  */
 void Preprocessor::extractComment(
 	std::string & line,
 	std::size_t const lineNumber,
 	std::size_t const lineColumn)
 {
 	static_cast<void>(lineNumber);

 	static char const directive[] = "DEFINE";
 	std::size_t const directiveLength = sizeof(directive) - 1;
 	std::size_t const directiveStart = lineColumn + 1;
 	std::size_t const argumentsStart = directiveStart + directiveLength;

 	// Only the keyword itself is compared (not the remainder of the line) and
 	// it has to be followed by whitespace. This keeps ordinary comments such
 	// as "#hello a b" and look-alikes such as "#DEFINEX" from being parsed as
 	// a DEFINE directive.
 	bool const isDefine =
 		line.size() > argumentsStart &&
 		line.compare(directiveStart, directiveLength, directive) == 0 &&
 		Utils::isWhitespaceCharacter(line[argumentsStart]);

 	if (isDefine)
 	{
 		std::string arguments[2];
 		std::size_t argumentCount = 0;

 		std::size_t i = argumentsStart;
 		while(i < line.size())
 		{
 			if (Utils::isWhitespaceCharacter(line[i]))
 			{
 				++i;
 				continue;
 			}

 			// Start of an argument: advance to its end
 			std::size_t const argumentStart = i;
 			while(i < line.size() && !Utils::isWhitespaceCharacter(line[i]))
 			{
 				++i;
 			}

 			if (argumentCount < 2)
 			{
 				arguments[argumentCount] = line.substr(argumentStart, i - argumentStart);
 			}
 			++argumentCount;
 		}

 		if (argumentCount > 0)
 		{
 			substitutionIdentifiers.push_back(arguments[0]);
 			substitutionValues.push_back(arguments[1]);
 		}
 	}

 	// Strip the comment or directive from the line
 	if (lineColumn < line.size())
 	{
 		line.erase(lineColumn);
 	}
 }
 // Walks over the line one column at a time. On every non whitespace column a
 // substitution is attempted first; when none applies and the character is a
 // `#`, the rest of the line is handed to extractComment and scanning stops.
 void Preprocessor::processLine(std::string & line, std::size_t const lineNumber)
 {
 	std::size_t column = 0;
 	// The size is re-read on every iteration as substitutions resize the line
 	while(column < line.size())
 	{
 		char const current = line[column];
 		bool const startsComment =
 			!Utils::isWhitespaceCharacter(current) &&
 			!trySubstitute(line, column, substitutionIdentifiers, substitutionValues) &&
 			current == '#';

 		if (startsComment)
 		{
 			extractComment(line, lineNumber, column);
 			return;
 		}

 		++column;
 	}
 }
 // Forgets all previously recorded substitutions and preprocesses the given
 // lines in order, modifying them in place.
 void Preprocessor::process(std::vector<std::string> & lines)
 {
 	substitutionValues.clear();
 	substitutionIdentifiers.clear();

 	std::size_t lineNumber = 0;
 	for(std::string & line : lines)
 	{
 		processLine(line, lineNumber);
 		++lineNumber;
 	}
 }
 // Writes every recorded substitution to standard output, one per line, in the
 // form "identifier -> value".
 void Preprocessor::printSubstitutions() const
 {
 	std::size_t const count = substitutionIdentifiers.size();
 	for(std::size_t index = 0; index != count; ++index)
 	{
 		char const * const identifier = substitutionIdentifiers[index].c_str();
 		char const * const value = substitutionValues[index].c_str();
 		std::printf("%s -> %s\n", identifier, value);
 	}
 }
--- a/src/token/tokenizer.cpp
+++ b/src/token/tokenizer.cpp
@@ -3,6 +3,7 @@
 #include <stdexcept>
 #include <token/errors.hpp>
 #include <token/tokenizer.hpp>
 #include <utils.hpp>
 namespace Token
 {
@@ -150,111 +151,6 @@ namespace Token
 		return Token::CreateLabelToken(string, lineNumber, lineColumn);
 	}
 	// Tells whether c is a newline, space, tab or carriage return.
 	bool IsWhiteSpace(char const c)
 	{
 		switch(c)
 		{
 			case '\n':
 			case ' ':
 			case '\t':
 			case '\r':
 			return true;

 			default:
 			return false;
 		}
 	}
 	// Parses the comment whose `#` is located at lineColumn. When the text
 	// directly after the `#` is the directive DEFINE, up to two whitespace
 	// separated arguments are read and stored as a pair in `substitutions`.
 	// A warning is printed when no argument was read.
 	//
 	// string:     the complete line containing the comment
 	// lineNumber: zero based line index, only used in the warning (printed + 1)
 	// lineColumn: index of the `#` within string
 	void Tokenizer::ParseComment(
 		std::string const & string,
 		int const lineNumber,
 		int const lineColumn)
 	{
 		unsigned const commentContentStart = lineColumn + 1;
 		// A `#` that is followed by whitespace is an ordinary comment
 		if (string.size() < commentContentStart ||
 			IsWhiteSpace(string[commentContentStart]))
 		{
 			return;
 		}
 		enum class CommentParseState
 		{
 			LookForDirectiveEnd,
 			LookForArgumentStart,
 			LookForArgumentEnd
 		};
 		std::string firstArgument, secondArgument;
 		unsigned argumentCount = 0, argumentStart = 0;
 		CommentParseState state = CommentParseState::LookForDirectiveEnd;
 		for(unsigned i = commentContentStart + 1; i < string.size(); ++i)
 		{
 			switch(state)
 			{
 				// The directive name runs up to the first whitespace character
 				case CommentParseState::LookForDirectiveEnd:
 				if(IsWhiteSpace(string[i]))
 				{
 					if (string.compare(commentContentStart, i - commentContentStart, "DEFINE"))
 					{
 						// Nonzero = not equal
 						return;
 					}
 					state = CommentParseState::LookForArgumentStart;
 				}
 				break;
 				// Skip whitespace until the next argument begins
 				case CommentParseState::LookForArgumentStart:
 				if(!IsWhiteSpace(string[i]))
 				{
 					argumentStart = i;
 					state = CommentParseState::LookForArgumentEnd;
 				}
 				break;
 				// An argument ends at the first whitespace character after it
 				case CommentParseState::LookForArgumentEnd:
 				if (IsWhiteSpace(string[i]))
 				{
 					state = CommentParseState::LookForArgumentStart;
 					switch(argumentCount)
 					{
 						case 0:
 						firstArgument = string.substr(argumentStart, i - argumentStart);
 						break;
 						case 1:
 						secondArgument = string.substr(argumentStart, i - argumentStart);
 						break;
 						// A third argument ends the parsing; it is not stored
 						default:
 						goto end_state_loop;
 					}
 					++argumentCount;
 				}
 				break;
 			}
 		}
end_state_loop:
 		// Handle an argument that runs up to the end of the line
 		// NOTE(review): when the line ends while still in LookForDirectiveEnd
 		// the DEFINE comparison above never ran, yet the warning below is
 		// printed because argumentCount is still 0.
 		switch(state)
 		{
 			case CommentParseState::LookForDirectiveEnd:
 			case CommentParseState::LookForArgumentStart:
 			break;
 			case CommentParseState::LookForArgumentEnd:
 			switch(argumentCount)
 			{
 				case 0:
 				firstArgument = string.substr(argumentStart);
 				break;
 				case 1:
 				secondArgument = string.substr(argumentStart);
 				break;
 			}
 			++argumentCount;
 			break;
 		}
 		if (argumentCount > 0)
 		{
 			substitutions.push_back(std::make_pair(firstArgument, secondArgument));
 		}
 		else
 		{
 			std::printf("WARNING: DEFINE with no arguments on line %u\n", lineNumber + 1);
 		}
 	}
 	// Modifies the lineColumn parameter to point at the character literal end
 	void Tokenizer::ParseCharacterLiteral(
 		std::string const & line,
@@ -314,16 +210,12 @@ end_state_loop:
 			switch(state)
 			{
 				case TokenizerState::LookForNextToken:
-				if (!IsWhiteSpace(line[column]))
+				if (!Utils::isWhitespaceCharacter(line[column]))
 				{
 					if (line[column] == '#')
 					{
 						ParseComment(line, lineNumber, column);
 						return;
 					}
 					if (line[column] == '\'')
 					{
 						// TODO integrate this better with the existing extract token
 						// infrastructure
 						ParseCharacterLiteral(line, lineNumber, column, tokens);
 						break;
 					}
@@ -345,7 +237,7 @@ end_state_loop:
 				break;
 				case TokenizerState::LookForTokenEnd:
-				if (IsWhiteSpace(line[column]) || line[column] == ';')
+				if (Utils::isWhitespaceCharacter(line[column]) || line[column] == ';')
 				{
 					tokens.push_back(ExtractToken(line.substr(columnTokenStart, column - columnTokenStart), lineNumber, columnTokenStart));
 					if (line[column] == ';')
--- a/src/utils.cpp
+++ b/src/utils.cpp
@@ -0,0 +1,9 @@
 #include <utils.hpp>
 namespace Utils
 {
 	// Tells whether c is a newline, space, tab or carriage return.
 	bool isWhitespaceCharacter(char const c)
 	{
 		switch(c)
 		{
 			case '\n':
 			case ' ':
 			case '\t':
 			case '\r':
 			return true;

 			default:
 			return false;
 		}
 	}
 }
--- a/src/wassembler.cpp
+++ b/src/wassembler.cpp
@@ -1,5 +1,6 @@
 #include <fstream>
 #include <interpret/errors.hpp>
 #include <preprocessor/preprocessor.hpp>
 #include <token/errors.hpp>
 #include <wassembler.hpp>
@@ -48,7 +49,6 @@ bool Wassembler::LoadLinesFromFile(std::string const & filePath, std::vector<std
 	return true;
 }
 bool Wassembler::LoadTokens(std::vector<std::string> const & lines, std::vector<Token::Token> & tokens) const
 {
 	Token::Tokenizer tokenizer;
@@ -86,6 +86,11 @@ void Wassembler::SetMemorySize(unsigned const size)
 	config.memorySize = size;
 }
 void Wassembler::EnableSubstitutionsLogging()
 {
 	printSubstitutions = true;
 }
 bool Wassembler::LoadFromFile(std::string const & filePath)
 {
 	std::vector<std::string> lines;
@@ -95,6 +100,13 @@ bool Wassembler::LoadFromFile(std::string const & filePath)
 		return false;
 	}
 	Preprocessor preprocessor;
 	preprocessor.process(lines);
 	if (printSubstitutions)
 	{
 		preprocessor.printSubstitutions();
 	}
 	std::vector<Token::Token> tokens;
 	if (!LoadTokens(lines, tokens))
 	{