Files
wassembly/src/token/tokenizer.cpp

271 lines
5.9 KiB
C++

#include <map>
#include <optional>
#include <stdexcept>
#include <token/errors.hpp>
#include <token/tokenizer.hpp>
#include <utils.hpp>
namespace Token
{
// Attempts to convert the given text to an int using std::stoi (base 10).
//
// Returns the parsed value on success. Returns std::nullopt when std::stoi
// reports that no conversion could be performed (std::invalid_argument) or
// that the value does not fit in an int (std::out_of_range), so that callers
// can emit an error token instead of having an exception escape the tokenizer.
//
// Note: std::stoi stops at the first character that cannot be part of the
// number, so input such as "12abc" still yields 12.
std::optional<int> TryParseInt(std::string const & string)
{
    try
    {
        return std::stoi(string);
    }
    catch (std::invalid_argument const &)
    {
        // The text does not start with anything that looks like an integer
        return std::nullopt;
    }
    catch (std::out_of_range const &)
    {
        // The text is numeric but the value cannot be represented by an int
        return std::nullopt;
    }
}
// Converts one whitespace-delimited piece of source text into a Token.
//
// The text is first replaced by its substitution when it exactly matches the
// first element of an entry in `substitutions` (only the first match is used).
// The result is then classified, in this order:
//   - leading '$'            -> immediate value (error token if not an integer)
//   - leading '%'            -> register
//   - leading ';'            -> statement end
//   - trailing ':'           -> label (the colon is stripped from the name)
//   - '[' and/or ']' present -> memory operand, or an error token when the
//                               brackets are unbalanced or the content is invalid
//   - known operand name     -> operand
//   - anything else          -> label
//
// lineNumber and lineColumn are forwarded to the created token; an empty input
// string yields an empty token.
Token Tokenizer::ExtractToken(std::string string,
    int const lineNumber,
    int const lineColumn) const
{
    if (string.empty())
    {
        // TODO Should this become an error token?
        return Token::CreateEmptyToken(lineNumber, lineColumn);
    }

    // Apply the first matching substitution, if any
    for (auto const & substitution : substitutions)
    {
        if (string == substitution.first)
        {
            string = substitution.second;
            break;
        }
    }

    char const leading = string[0];

    if (leading == '$')
    {
        auto const parsed = TryParseInt(string.substr(1));
        if (!parsed.has_value())
        {
            return Token::CreateErrorToken(
                "Immediate cannot be parsed as an integer",
                TokenType::ImmediateInteger,
                lineNumber,
                lineColumn);
        }

        return Token::CreateImmediateValueToken(
            parsed.value(),
            lineNumber,
            lineColumn);
    }

    if (leading == '%')
    {
        return Token::CreateRegisterToken(
            GetRegisterType(string.substr(1)),
            lineNumber,
            lineColumn);
    }

    if (leading == ';')
    {
        return Token::CreateStatementEndToken(lineNumber, lineColumn);
    }

    char const trailing = string.back();

    if (trailing == ':')
    {
        // TODO check if label is an Operand?
        return Token::CreateLabelToken(
            string.substr(0, string.size() - 1),
            lineNumber,
            lineColumn);
    }

    bool const opensBracket = (leading == '[');
    bool const closesBracket = (trailing == ']');

    if (opensBracket != closesBracket)
    {
        // Exactly one bracket is present: report the position of that bracket
        int const errorLineColumn = opensBracket ? lineColumn : (lineColumn + string.size() - 1u);
        return Token::CreateErrorToken(
            "Non terminated memory address brackets",
            TokenType::Memory,
            lineNumber,
            errorLineColumn);
    }

    if (opensBracket && closesBracket)
    {
        // Shortest valid form is four characters: '[', kind prefix, one value
        // character and ']'
        if (string.size() < 4)
        {
            return Token::CreateErrorToken(
                "Memory address statement is empty",
                TokenType::Memory,
                lineNumber,
                lineColumn);
        }

        // Everything between the kind prefix and the closing bracket
        std::string const addressText = string.substr(2, string.size() - 3u);

        switch (string[1])
        {
            case '$':
            {
                auto const address = TryParseInt(addressText);
                if (!address.has_value())
                {
                    return Token::CreateErrorToken(
                        "Memory immediate address cannot be parsed as an integer",
                        TokenType::Memory,
                        lineNumber,
                        lineColumn);
                }

                return Token::CreateMemoryToken(
                    address.value(),
                    lineNumber,
                    lineColumn);
            }

            case '%':
                return Token::CreateMemoryToken(
                    GetRegisterType(addressText),
                    lineNumber,
                    lineColumn);

            default:
                // Error column is one past the token start (the kind prefix)
                return Token::CreateErrorToken(
                    "Memory immediate address contains an unexpected value",
                    TokenType::Memory,
                    lineNumber,
                    lineColumn + 1u);
        }
    }

    OperandType const operand = GetOperandType(string);
    if (operand == OperandType::Unknown)
    {
        // Last resort: it must be a label
        return Token::CreateLabelToken(string, lineNumber, lineColumn);
    }

    return Token::CreateOperandToken(operand, lineNumber, lineColumn);
}
// Parses a single-quoted character literal whose opening quote is located at
// line[lineColumn] and appends exactly one token to `tokens`:
//   - an immediate value token holding the character when exactly one
//     character sits between the quotes,
//   - an error token when the quotes enclose zero or several characters,
//   - an error token when no closing quote exists on the line.
//
// Modifies the lineColumn parameter to point at the character literal end: the
// index of the closing quote, or line.size() when the literal is not
// terminated.
void Tokenizer::ParseCharacterLiteral(
    std::string const & line,
    int const lineNumber,
    unsigned & lineColumn,
    std::vector<Token> & tokens) const
{
    // Column of the first character after the opening quote
    unsigned const contentColumn = lineColumn + 1u;

    auto const closingQuote = line.find('\'', contentColumn);
    if (closingQuote == std::string::npos)
    {
        tokens.push_back(Token::CreateErrorToken(
            "Non terminated character literal",
            TokenType::ImmediateInteger,
            lineNumber,
            lineColumn));
        lineColumn = line.size();
        return;
    }

    unsigned const closingColumn = static_cast<unsigned>(closingQuote);

    // A valid literal has the closing quote exactly two columns after the
    // opening quote
    bool const holdsSingleCharacter = (closingColumn == lineColumn + 2u);
    if (holdsSingleCharacter)
    {
        tokens.push_back(Token::CreateImmediateValueToken(
            line[closingColumn - 1],
            lineNumber,
            contentColumn));
    }
    else
    {
        tokens.push_back(Token::CreateErrorToken(
            "Character literal must be exactly 1 character long between single quotes",
            TokenType::ImmediateInteger,
            lineNumber,
            contentColumn));
    }

    lineColumn = closingColumn;
}
// Splits one source line into tokens and appends them to `tokens`.
//
// Whitespace separates tokens and is otherwise skipped. A ';' always produces
// its own token and also terminates the token directly in front of it. A
// single quote found where a new token may start is handed to
// ParseCharacterLiteral. Every other run of characters is passed to
// ExtractToken together with the column at which the run starts.
void Tokenizer::Tokenize(
    std::string const & line,
    int const lineNumber,
    std::vector<Token> & tokens)
{
    unsigned column = 0u;
    while (column < line.size())
    {
        if (Utils::isWhitespaceCharacter(line[column]))
        {
            ++column;
            continue;
        }

        if (line[column] == '\'')
        {
            // TODO integrate this better with the existing extract token
            // infrastructure
            ParseCharacterLiteral(line, lineNumber, column, tokens);

            // column was moved to the end of the literal; step past it
            ++column;
            continue;
        }

        if (line[column] == ';')
        {
            tokens.push_back(
                ExtractToken(line.substr(column, 1), lineNumber, column));
            ++column;
            continue;
        }

        // Start of a regular token: advance until whitespace, ';' or the end
        // of the line is reached
        unsigned const tokenStart = column;
        ++column;
        while (column < line.size()
            && !Utils::isWhitespaceCharacter(line[column])
            && line[column] != ';')
        {
            ++column;
        }

        tokens.push_back(ExtractToken(
            line.substr(tokenStart, column - tokenStart),
            lineNumber,
            tokenStart));

        // A ';' that terminated the token is a token of its own
        if (column < line.size() && line[column] == ';')
        {
            tokens.push_back(
                ExtractToken(line.substr(column, 1), lineNumber, column));
        }

        // Step past the terminating character
        ++column;
    }
}
}