Extract preprocessing from tokenizer

This commit is contained in:
2020-08-29 11:25:10 +02:00
parent aebc1dd86d
commit 71678b2ec6
12 changed files with 254 additions and 134 deletions

View File

@@ -24,8 +24,7 @@ afterwards can be a bit cryptic as to where it originated.
- `[operation][number type]`, e.g. `divi` for divide (div) integer
- `%[register]` for addressing registers
- `$[value]` for using immediate (literal) integer values
- `'a'` for using immediate character values (currently only supports non
escaped characters)
- `'a'` for using immediate character values
- `;` for end of statement (mandatory)
- `[label]:` for labels
- `#[text]` for comments: any text is ignored till a newline (`\n`) is found
@@ -92,13 +91,33 @@ There is currently no strict checking, so be careful.
### Preprocessor
All preprocessor directives are prefixed by a `#`. Ill formed preprocessor
directives do not halt compilation, they are merely reported and then ignored.
directives do not halt compilation, they are merely ignored. All preprocessing
is done in a single pass. Recursion, or defining one directive by means of
another directive, is therefore not supported.
- `DEFINE` replaces any occurrence of the first argument by the second argument.
The second argument may be empty, effectively deleting occurrences of argument
one. Quotes are currently not supported and arguments are separated by
whitespace. If multiple defines exist for the same substitution the first
declared is used.
- `DEFINE <x> [y]` replaces any occurrence of the first argument (`x`) by the
second optional argument (`y`). The second argument can be empty, effectively
deleting all occurrences of `x`. Quotes are currently not supported and
arguments are separated by whitespace. If multiple defines exist the later
declarations will overwrite the previous.
### Registers
All registers are 32 bits wide. The following 4 registers currently exist:
- A
- B
- C
- D
### Immediates
An immediate integer value for 42 is for example `$42`. Negative values are
allowed, for example `$-42`. Notation must be decimal; hexadecimal and octal
notations are **not supported**.
The immediate character value for the letter g is `'g'`. Character values must
be a single character, escaped or multi byte characters are **not supported**.
### Operands

View File

@@ -1,4 +1,4 @@
#DEFINE
#DEFINE LETTER_O 'o'
addi $10 $-5 %A;
subi %A $2 %B;
@@ -31,7 +31,7 @@ int PRINT_CHAR;
seti %A $108; # l
int PRINT_CHAR;
int PRINT_CHAR;
seti %A $111; # o
seti %A LETTER_O;
int PRINT_CHAR;
seti %A $32; # space
@@ -105,9 +105,6 @@ seti %B $10;
int $5;
int $3;
# Demonstrate string literals
"Hello world!";
exit;
noop_function:

View File

@@ -0,0 +1,21 @@
#pragma once
#include <string>
#include <vector>
// Single-pass source preprocessor.
//
// Holds the substitutions collected so far as two parallel lists: the entry at
// index i of substitutionIdentifiers is replaced by the entry at index i of
// substitutionValues.
class Preprocessor
{
public:
    // Preprocesses all lines in place, one after another.
    void process(std::vector<std::string> & lines);

    // Prints every stored substitution as "identifier -> value".
    void printSubstitutions() const;

private:
    // Preprocesses a single line in place.
    void processLine(std::string & line, std::size_t const lineNumber);

    // Handles the `#` found at lineColumn of the given line.
    void extractComment(
        std::string & line,
        std::size_t const lineNumber,
        std::size_t const lineColumn);

    std::vector<std::string> substitutionIdentifiers;
    std::vector<std::string> substitutionValues;
};

View File

@@ -15,11 +15,6 @@ namespace Token
int const lineNumber,
int const lineColumn) const;
void ParseComment(
std::string const & string,
int const lineNumber,
int const lineColumn);
void ParseCharacterLiteral(
std::string const & line,
int const lineNumber,

6
include/utils.hpp Normal file
View File

@@ -0,0 +1,6 @@
#pragma once
// Small helpers shared between the preprocessor and the tokenizer.
namespace Utils
{
// Reports whether c is treated as whitespace by the callers of this helper.
// NOTE(review): the accepted character set is defined in the implementation
// file; confirm there before relying on a specific character.
bool isWhitespaceCharacter(char const c);
}

View File

@@ -9,12 +9,15 @@ class Wassembler
private:
Configuration config;
Execute::VirtualMachine vm;
// Whether the collected substitutions are printed after preprocessing.
// Explicitly off by default: EnableSubstitutionsLogging() only ever sets it to
// true, so without an initializer a default-constructed object would read an
// indeterminate value.
bool printSubstitutions = false;
bool LoadLinesFromFile(std::string const & filePath, std::vector<std::string> & lines) const;
bool LoadTokens(std::vector<std::string> const & lines, std::vector<Token::Token> & tokens) const;
public:
void SetMemorySize(unsigned const size);
void EnableSubstitutionsLogging();
bool LoadFromFile(std::string const & filePath);
void Run();

View File

@@ -13,7 +13,7 @@ BINARY = bin/wassembler
all: ${BINARY}
check: ${BINARY}
./$< ./bin/test.wasm
./$< ./bin/test.wasm -p
clean:
-rm -rf build ./${BINARY}

View File

@@ -7,10 +7,12 @@ int main(int argc, char ** argv)
{
std::string inputFile;
unsigned memorySize = 1024;
bool printSubstitutions = false;
auto cli = (
clipp::value("input wasm file").set(inputFile),
clipp::option("-m", "--memory-size") & clipp::value("memory size", memorySize)
clipp::option("-m", "--memory-size") & clipp::value("memory size", memorySize),
clipp::option("-p", "--print-substitutions").set(printSubstitutions)
);
if (!clipp::parse(argc, argv, cli))
@@ -21,6 +23,11 @@ int main(int argc, char ** argv)
Wassembler wassembler;
wassembler.SetMemorySize(memorySize);
if (printSubstitutions)
{
wassembler.EnableSubstitutionsLogging();
}
if (!wassembler.LoadFromFile(inputFile))
{
exit(1);

View File

@@ -0,0 +1,159 @@
#include <cstdio>
#include <preprocessor/preprocessor.hpp>
#include <utils.hpp>
// Attempts to apply one substitution at position lineColumn of line.
//
// The identifiers are tried in order; the first one that matches the text
// starting at lineColumn is replaced in place by the value stored at the same
// index of substitutionValues. Returns true when a replacement was made and
// false (leaving line untouched) when no identifier matches.
bool trySubstitute(
    std::string & line,
    std::size_t const lineColumn,
    std::vector<std::string> const & substitutionIdentifiers,
    std::vector<std::string> const & substitutionValues)
{
    std::size_t const candidateCount = substitutionIdentifiers.size();
    for (std::size_t index = 0; index != candidateCount; ++index)
    {
        std::string const & identifier = substitutionIdentifiers[index];
        bool const matchesHere =
            line.compare(lineColumn, identifier.size(), identifier) == 0;
        if (matchesHere)
        {
            // A match implies the whole identifier is present, so exactly
            // identifier.size() characters are swapped for the value.
            line.replace(lineColumn, identifier.size(), substitutionValues[index]);
            return true;
        }
    }

    return false;
}
// Handles the `#` located at lineColumn: everything from that column to the
// end of the line is removed from line.
//
// When the text directly after the `#` is the directive `DEFINE` followed by
// whitespace, the first whitespace-separated argument is stored as a
// substitution identifier and the optional second argument as its value (empty
// when absent). Further arguments are ignored. Defining an identifier that
// already exists overwrites its value, as the README documents. Anything else
// (a plain comment, a DEFINE without arguments, an unknown directive) is
// silently dropped.
//
// line       - line to process; truncated at lineColumn on return
// lineNumber - index of the line; currently unused because nothing is reported
// lineColumn - position of the `#` inside line
void Preprocessor::extractComment(
    std::string & line,
    std::size_t const lineNumber,
    std::size_t const lineColumn)
{
    static_cast<void>(lineNumber);

    static char const directive[] = "DEFINE";
    std::size_t const directiveLength = sizeof(directive) - 1;
    std::size_t const directiveStart = lineColumn + 1;
    std::size_t const argumentsStart = directiveStart + directiveLength;

    // Compare exactly the directive's length for equality. A whitespace
    // character must follow so that e.g. `#DEFINED` or an ordinary comment is
    // not mistaken for a directive.
    bool const isDefine =
        line.size() > argumentsStart &&
        line.compare(directiveStart, directiveLength, directive) == 0 &&
        Utils::isWhitespaceCharacter(line[argumentsStart]);

    std::string arguments[2];
    std::size_t argumentCount = 0;
    if (isDefine)
    {
        std::size_t cursor = argumentsStart;
        while (argumentCount < 2)
        {
            while (cursor < line.size() && Utils::isWhitespaceCharacter(line[cursor]))
            {
                ++cursor;
            }
            if (cursor == line.size())
            {
                break;
            }

            std::size_t const argumentStart = cursor;
            while (cursor < line.size() && !Utils::isWhitespaceCharacter(line[cursor]))
            {
                ++cursor;
            }
            arguments[argumentCount] = line.substr(argumentStart, cursor - argumentStart);
            ++argumentCount;
        }
    }

    if (argumentCount > 0)
    {
        // Later definitions replace earlier ones for the same identifier.
        std::size_t existing = 0;
        while (existing < substitutionIdentifiers.size() &&
            substitutionIdentifiers[existing] != arguments[0])
        {
            ++existing;
        }

        if (existing < substitutionIdentifiers.size())
        {
            substitutionValues[existing] = arguments[1];
        }
        else
        {
            substitutionIdentifiers.push_back(arguments[0]);
            substitutionValues.push_back(arguments[1]);
        }
    }

    // Strip the comment / directive so later stages never see it.
    line.erase(lineColumn);
}
// Preprocesses one line in place, scanning it from left to right.
//
// At every non-whitespace position the known substitutions are tried first.
// When one applies, scanning resumes directly behind the inserted value, so
// replacement text is never expanded again (preprocessing is a single pass)
// and the character following an empty replacement is still examined. A `#`
// that is not part of a substitution hands the rest of the line to
// extractComment and ends the scan.
//
// line       - line to process; modified in place
// lineNumber - index of the line, forwarded to extractComment
void Preprocessor::processLine(std::string & line, std::size_t const lineNumber)
{
    std::size_t column = 0;
    while (column < line.size())
    {
        if (Utils::isWhitespaceCharacter(line[column]))
        {
            ++column;
            continue;
        }

        // trySubstitute only reports success, so locate the entry it will
        // apply (the first identifier matching at this column) to learn how
        // many characters it inserts.
        std::size_t match = 0;
        while (match < substitutionIdentifiers.size() &&
            line.compare(
                column,
                substitutionIdentifiers[match].size(),
                substitutionIdentifiers[match]) != 0)
        {
            ++match;
        }

        if (match < substitutionIdentifiers.size() &&
            trySubstitute(line, column, substitutionIdentifiers, substitutionValues))
        {
            // Skip the inserted value. Stored identifiers are never empty, so
            // every substitution consumes source text and the loop terminates.
            column += substitutionValues[match].size();
            continue;
        }

        if (line[column] == '#')
        {
            extractComment(line, lineNumber, column);
            return;
        }

        ++column;
    }
}
// Preprocesses every line in place, in order. Substitutions left over from a
// previous run are discarded first; each line is passed to processLine along
// with its zero-based index.
void Preprocessor::process(std::vector<std::string> & lines)
{
    substitutionIdentifiers.clear();
    substitutionValues.clear();

    std::size_t lineNumber = 0;
    for (std::string & currentLine : lines)
    {
        processLine(currentLine, lineNumber);
        ++lineNumber;
    }
}
// Writes every stored substitution to standard output, one per line, in the
// form "identifier -> value" and in the order they are stored.
void Preprocessor::printSubstitutions() const
{
    // Identifiers and values are parallel lists; walk them side by side.
    std::vector<std::string>::const_iterator value = substitutionValues.begin();
    for (std::string const & identifier : substitutionIdentifiers)
    {
        std::printf("%s -> %s\n", identifier.c_str(), value->c_str());
        ++value;
    }
}

View File

@@ -3,6 +3,7 @@
#include <stdexcept>
#include <token/errors.hpp>
#include <token/tokenizer.hpp>
#include <utils.hpp>
namespace Token
{
@@ -150,111 +151,6 @@ namespace Token
return Token::CreateLabelToken(string, lineNumber, lineColumn);
}
// Returns true when c is a newline, space, tab or carriage return.
bool IsWhiteSpace(char const c)
{
    char const whitespaceCharacters[] = {'\n', ' ', '\t', '\r'};
    for (char const candidate : whitespaceCharacters)
    {
        if (c == candidate)
        {
            return true;
        }
    }
    return false;
}
// Parses the comment that starts with the `#` at lineColumn of string and, if
// it is a `DEFINE` directive, records its arguments as a substitution.
//
// string     - the full source line
// lineNumber - zero-based line index; printed one-based in the warning
// lineColumn - position of the `#` inside string
//
// A `#` followed by whitespace is treated as a plain comment and ignored. A
// directive other than DEFINE is ignored as soon as the whitespace ending its
// name is found. For DEFINE, the first two whitespace-separated arguments are
// stored as a pair in `substitutions`; a DEFINE without arguments prints a
// warning instead.
void Tokenizer::ParseComment(
std::string const & string,
int const lineNumber,
int const lineColumn)
{
unsigned const commentContentStart = lineColumn + 1;
// NOTE(review): when string.size() == commentContentStart (the `#` is the last
// character) the size check passes and string[commentContentStart] is read at
// index size() - confirm this is intended.
if (string.size() < commentContentStart ||
IsWhiteSpace(string[commentContentStart]))
{
return;
}
// States of the scanner below: first the end of the directive name is
// searched, afterwards it alternates between searching the start and the end
// of an argument.
enum class CommentParseState
{
LookForDirectiveEnd,
LookForArgumentStart,
LookForArgumentEnd
};
std::string firstArgument, secondArgument;
unsigned argumentCount = 0, argumentStart = 0;
CommentParseState state = CommentParseState::LookForDirectiveEnd;
for(unsigned i = commentContentStart + 1; i < string.size(); ++i)
{
switch(state)
{
case CommentParseState::LookForDirectiveEnd:
if(IsWhiteSpace(string[i]))
{
// The directive name spans [commentContentStart, i); only DEFINE is handled.
if (string.compare(commentContentStart, i - commentContentStart, "DEFINE"))
{
// Nonzero = not equal
return;
}
state = CommentParseState::LookForArgumentStart;
}
break;
case CommentParseState::LookForArgumentStart:
if(!IsWhiteSpace(string[i]))
{
argumentStart = i;
state = CommentParseState::LookForArgumentEnd;
}
break;
case CommentParseState::LookForArgumentEnd:
if (IsWhiteSpace(string[i]))
{
state = CommentParseState::LookForArgumentStart;
switch(argumentCount)
{
case 0:
firstArgument = string.substr(argumentStart, i - argumentStart);
break;
case 1:
secondArgument = string.substr(argumentStart, i - argumentStart);
break;
default:
// A third argument ends the scan; it is not stored and not counted.
goto end_state_loop;
}
++argumentCount;
}
break;
}
}
end_state_loop:
// Finish an argument that runs up to the end of the line (no trailing
// whitespace closed it inside the loop).
switch(state)
{
case CommentParseState::LookForDirectiveEnd:
case CommentParseState::LookForArgumentStart:
break;
case CommentParseState::LookForArgumentEnd:
switch(argumentCount)
{
case 0:
firstArgument = string.substr(argumentStart);
break;
case 1:
secondArgument = string.substr(argumentStart);
break;
}
++argumentCount;
break;
}
if (argumentCount > 0)
{
substitutions.push_back(std::make_pair(firstArgument, secondArgument));
}
else
{
// NOTE(review): this branch is also reached when the line ends while still in
// LookForDirectiveEnd (directive name never compared against DEFINE), and the
// int argument is printed with %u - confirm both are acceptable.
std::printf("WARNING: DEFINE with no arguments on line %u\n", lineNumber + 1);
}
}
// Modifies the lineColumn parameter to point at the character literal end
void Tokenizer::ParseCharacterLiteral(
std::string const & line,
@@ -314,16 +210,12 @@ end_state_loop:
switch(state)
{
case TokenizerState::LookForNextToken:
if (!IsWhiteSpace(line[column]))
if (!Utils::isWhitespaceCharacter(line[column]))
{
if (line[column] == '#')
{
ParseComment(line, lineNumber, column);
return;
}
if (line[column] == '\'')
{
// TODO integrate this better with the existing extract token
// infrastructure
ParseCharacterLiteral(line, lineNumber, column, tokens);
break;
}
@@ -345,7 +237,7 @@ end_state_loop:
break;
case TokenizerState::LookForTokenEnd:
if (IsWhiteSpace(line[column]) || line[column] == ';')
if (Utils::isWhitespaceCharacter(line[column]) || line[column] == ';')
{
tokens.push_back(ExtractToken(line.substr(columnTokenStart, column - columnTokenStart), lineNumber, columnTokenStart));
if (line[column] == ';')

9
src/utils.cpp Normal file
View File

@@ -0,0 +1,9 @@
#include <utils.hpp>
namespace Utils
{
    // Returns true when c is one of the characters treated as whitespace:
    // newline, space, horizontal tab or carriage return.
    bool isWhitespaceCharacter(char const c)
    {
        switch (c)
        {
        case '\n':
        case ' ':
        case '\t':
        case '\r':
            return true;
        default:
            return false;
        }
    }
}

View File

@@ -1,5 +1,6 @@
#include <fstream>
#include <interpret/errors.hpp>
#include <preprocessor/preprocessor.hpp>
#include <token/errors.hpp>
#include <wassembler.hpp>
@@ -48,7 +49,6 @@ bool Wassembler::LoadLinesFromFile(std::string const & filePath, std::vector<std
return true;
}
bool Wassembler::LoadTokens(std::vector<std::string> const & lines, std::vector<Token::Token> & tokens) const
{
Token::Tokenizer tokenizer;
@@ -86,6 +86,11 @@ void Wassembler::SetMemorySize(unsigned const size)
config.memorySize = size;
}
// Turns on printing of the preprocessor substitutions: sets the
// printSubstitutions flag. There is no counterpart here that clears it again.
void Wassembler::EnableSubstitutionsLogging()
{
printSubstitutions = true;
}
bool Wassembler::LoadFromFile(std::string const & filePath)
{
std::vector<std::string> lines;
@@ -95,6 +100,13 @@ bool Wassembler::LoadFromFile(std::string const & filePath)
return false;
}
Preprocessor preprocessor;
preprocessor.process(lines);
if (printSubstitutions)
{
preprocessor.printSubstitutions();
}
std::vector<Token::Token> tokens;
if (!LoadTokens(lines, tokens))
{