271 lines
5.9 KiB
C++
271 lines
5.9 KiB
C++
#include <map>
|
|
#include <optional>
|
|
#include <stdexcept>
|
|
#include <token/errors.hpp>
|
|
#include <token/tokenizer.hpp>
|
|
#include <utils.hpp>
|
|
|
|
namespace Token
|
|
{
|
|
// Tries to read a base-10 integer from `string` using std::stoi.
//
// Returns the parsed value, or std::nullopt when std::stoi rejects the input:
//  - std::invalid_argument: the text does not start with a number;
//  - std::out_of_range: the number does not fit in an int.
//
// Note: std::stoi skips leading whitespace and stops at the first character
// that is not part of the number, so text such as "12abc" still yields 12.
std::optional<int> TryParseInt(std::string const & string)
{
    try
    {
        return std::stoi(string);
    }
    catch (std::invalid_argument const &)
    {
        // Not a number at all.
        return std::nullopt;
    }
    catch (std::out_of_range const &)
    {
        // A number, but too large (or too small) for an int. Without this
        // handler the exception would escape to the caller.
        return std::nullopt;
    }
}
|
|
|
|
// Classifies one piece of source text and builds the matching token.
//
// Parameters:
//  string      - the raw text of the token. Taken by value because it is
//                replaced by its substitution when one matches.
//  lineNumber  - line number stored in the produced token.
//  lineColumn  - column stored in the produced token (some error tokens
//                offset it to point at the offending character).
//
// The text is classified in this order:
//  1. empty text                      -> empty token
//  2. '$' prefix                      -> immediate value token, or an error
//                                        token when the number cannot be parsed
//  3. '%' prefix                      -> register token
//  4. ';' prefix                      -> statement end token
//  5. ':' postfix                     -> label token (without the colon)
//  6. "[$...]" / "[%...]"             -> memory token, or an error token
//  7. a lone '[' prefix or ']' postfix-> error token
//  8. a known operand                 -> operand token
//  9. anything else                   -> label token
Token Tokenizer::ExtractToken(std::string string,
    int const lineNumber,
    int const lineColumn) const
{
    if (string.empty())
    {
        // TODO Should this become an error token?
        return Token::CreateEmptyToken(lineNumber, lineColumn);
    }

    // Apply the first substitution whose key equals the whole token text.
    for (auto const & substitution : substitutions)
    {
        if (string == substitution.first)
        {
            string = substitution.second;
            break;
        }
    }

    if (string.empty())
    {
        // A substitution may have replaced the text with nothing. The code
        // below reads the first and last character, which would index out of
        // bounds on an empty string, so treat it like an empty token.
        return Token::CreateEmptyToken(lineNumber, lineColumn);
    }

    char const prefix = string.front();
    char const postfix = string.back();

    switch (prefix)
    {
        case '$':
        {
            auto const parsedValue = TryParseInt(string.substr(1));
            if (!parsedValue.has_value())
            {
                return Token::CreateErrorToken(
                    "Immediate cannot be parsed as an integer",
                    TokenType::ImmediateInteger,
                    lineNumber,
                    lineColumn);
            }

            return Token::CreateImmediateValueToken(
                parsedValue.value(),
                lineNumber,
                lineColumn);
        }

        case '%':
            return Token::CreateRegisterToken(
                GetRegisterType(string.substr(1)),
                lineNumber,
                lineColumn);

        case ';':
            return Token::CreateStatementEndToken(lineNumber, lineColumn);

        default:
            break;
    }

    if (postfix == ':')
    {
        // TODO check if label is an Operand?
        return Token::CreateLabelToken(
            string.substr(0, string.size() - 1),
            lineNumber,
            lineColumn);
    }

    bool const opensBracket = (prefix == '[');
    bool const closesBracket = (postfix == ']');

    if (opensBracket && closesBracket)
    {
        // The shortest accepted form is 4 characters: '[', a '$' or '%'
        // marker, at least one value character, and ']'.
        if (string.size() < 4)
        {
            return Token::CreateErrorToken(
                "Memory address statement is empty",
                TokenType::Memory,
                lineNumber,
                lineColumn);
        }

        char const memoryPrefix = string[1];
        // Everything between the marker character and the closing bracket.
        std::string const valueString = string.substr(2, string.size() - 3u);

        if (memoryPrefix == '$')
        {
            auto const parsedAddress = TryParseInt(valueString);
            if (!parsedAddress.has_value())
            {
                return Token::CreateErrorToken(
                    "Memory immediate address cannot be parsed as an integer",
                    TokenType::Memory,
                    lineNumber,
                    lineColumn);
            }

            return Token::CreateMemoryToken(
                parsedAddress.value(),
                lineNumber,
                lineColumn);
        }

        if (memoryPrefix == '%')
        {
            return Token::CreateMemoryToken(
                GetRegisterType(valueString),
                lineNumber,
                lineColumn);
        }

        // Point the error at the unexpected marker, one past the '['.
        return Token::CreateErrorToken(
            "Memory immediate address contains an unexpected value",
            TokenType::Memory,
            lineNumber,
            lineColumn + 1u);
    }

    if (opensBracket || closesBracket)
    {
        // Only one bracket is present: report the column of that bracket
        // (token start for '[', last character of the token for ']').
        int const errorLineColumn = opensBracket
            ? lineColumn
            : lineColumn + static_cast<int>(string.size()) - 1;

        return Token::CreateErrorToken(
            "Non terminated memory address brackets",
            TokenType::Memory,
            lineNumber,
            errorLineColumn);
    }

    OperandType const opType = GetOperandType(string);
    if (opType != OperandType::Unknown)
    {
        return Token::CreateOperandToken(opType, lineNumber, lineColumn);
    }

    // Last resort: it must be a label
    return Token::CreateLabelToken(string, lineNumber, lineColumn);
}
|
|
|
|
// Modifies the lineColumn parameter to point at the character literal end
|
|
void Tokenizer::ParseCharacterLiteral(
|
|
std::string const & line,
|
|
int const lineNumber,
|
|
unsigned & lineColumn,
|
|
std::vector<Token> & tokens) const
|
|
{
|
|
for(unsigned int i = lineColumn + 1; i < line.size(); ++i)
|
|
{
|
|
if (line[i] == '\'')
|
|
{
|
|
if (lineColumn + 2u != i)
|
|
{
|
|
tokens.push_back(Token::CreateErrorToken(
|
|
"Character literal must be exactly 1 character long between single quotes",
|
|
TokenType::ImmediateInteger,
|
|
lineNumber,
|
|
lineColumn + 1u));
|
|
}
|
|
else
|
|
{
|
|
tokens.push_back(Token::CreateImmediateValueToken(
|
|
line[i - 1],
|
|
lineNumber,
|
|
lineColumn + 1));
|
|
}
|
|
|
|
lineColumn = i;
|
|
return;
|
|
}
|
|
}
|
|
|
|
tokens.push_back(Token::CreateErrorToken(
|
|
"Non terminated character literal",
|
|
TokenType::ImmediateInteger,
|
|
lineNumber,
|
|
lineColumn));
|
|
|
|
lineColumn = line.size();
|
|
}
|
|
|
|
void Tokenizer::Tokenize(
|
|
std::string const & line,
|
|
int const lineNumber,
|
|
std::vector<Token> & tokens)
|
|
{
|
|
enum class TokenizerState
|
|
{
|
|
LookForNextToken,
|
|
LookForTokenEnd,
|
|
};
|
|
|
|
TokenizerState state = TokenizerState::LookForNextToken;
|
|
unsigned columnTokenStart = 0;
|
|
for(unsigned column = 0u; column < line.size(); ++column)
|
|
{
|
|
switch(state)
|
|
{
|
|
case TokenizerState::LookForNextToken:
|
|
if (!Utils::isWhitespaceCharacter(line[column]))
|
|
{
|
|
if (line[column] == '\'')
|
|
{
|
|
// TODO integrate this better with the existing extract token
|
|
// infrastructure
|
|
ParseCharacterLiteral(line, lineNumber, column, tokens);
|
|
break;
|
|
}
|
|
|
|
columnTokenStart = column;
|
|
|
|
switch(line[column])
|
|
{
|
|
case ';':
|
|
tokens.push_back(
|
|
ExtractToken(line.substr(column, 1), lineNumber, column));
|
|
break;
|
|
|
|
default:
|
|
state = TokenizerState::LookForTokenEnd;
|
|
break;
|
|
}
|
|
}
|
|
break;
|
|
|
|
case TokenizerState::LookForTokenEnd:
|
|
if (Utils::isWhitespaceCharacter(line[column]) || line[column] == ';')
|
|
{
|
|
tokens.push_back(ExtractToken(line.substr(columnTokenStart, column - columnTokenStart), lineNumber, columnTokenStart));
|
|
if (line[column] == ';')
|
|
{
|
|
tokens.push_back(ExtractToken(line.substr(column, 1), lineNumber, column));
|
|
}
|
|
state = TokenizerState::LookForNextToken;
|
|
}
|
|
break;
|
|
|
|
default:
|
|
std::puts("DEBUG: Unhandled TokenizerState value");
|
|
break;
|
|
}
|
|
}
|
|
|
|
switch(state)
|
|
{
|
|
case TokenizerState::LookForTokenEnd:
|
|
tokens.push_back(ExtractToken(
|
|
line.substr(columnTokenStart, line.size()),
|
|
lineNumber,
|
|
columnTokenStart));
|
|
break;
|
|
|
|
case TokenizerState::LookForNextToken:
|
|
default:
|
|
break;
|
|
}
|
|
}
|
|
} |