From 9e9039af05bc831d09975c1c564f2c36d9f19c9f Mon Sep 17 00:00:00 2001 From: zanostro Date: Wed, 10 Dec 2025 18:02:06 +0100 Subject: [PATCH] working AST --- .gitignore | 2 + simulator_SIC_XE/CMakeLists.txt | 6 +- simulator_SIC_XE/gui/qt/mainwindow.cpp | 2 +- simulator_SIC_XE/include/lexer.h | 55 +++ simulator_SIC_XE/include/mnemonic.h | 41 ++- simulator_SIC_XE/include/node.h | 87 ++++- simulator_SIC_XE/include/opcode.h | 10 + simulator_SIC_XE/include/parser.h | 52 +++ simulator_SIC_XE/src/lexer.cpp | 138 ++++++++ simulator_SIC_XE/src/mnemonic.cpp | 6 - simulator_SIC_XE/src/node.cpp | 122 ++++++- simulator_SIC_XE/src/opcode.cpp | 28 ++ simulator_SIC_XE/src/parser.cpp | 449 +++++++++++++++++++++++++ 13 files changed, 962 insertions(+), 36 deletions(-) create mode 100644 simulator_SIC_XE/include/lexer.h create mode 100644 simulator_SIC_XE/include/parser.h create mode 100644 simulator_SIC_XE/src/lexer.cpp delete mode 100644 simulator_SIC_XE/src/mnemonic.cpp create mode 100644 simulator_SIC_XE/src/parser.cpp diff --git a/.gitignore b/.gitignore index dbda62e..6a3fb13 100644 --- a/.gitignore +++ b/.gitignore @@ -21,3 +21,5 @@ __pycache__/ autotester sictools.jar + +/build/ diff --git a/simulator_SIC_XE/CMakeLists.txt b/simulator_SIC_XE/CMakeLists.txt index c6c37e1..e06a422 100644 --- a/simulator_SIC_XE/CMakeLists.txt +++ b/simulator_SIC_XE/CMakeLists.txt @@ -1,7 +1,7 @@ cmake_minimum_required(VERSION 3.10) project(simulator_SIC_XE VERSION 1.0 LANGUAGES CXX) -set(CMAKE_CXX_STANDARD 17) +set(CMAKE_CXX_STANDARD 20) set(CMAKE_CXX_STANDARD_REQUIRED ON) # Put all build outputs under target/bin @@ -13,6 +13,10 @@ set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${OUTPUT_DIR}) # Collect all .cpp sources under src/ file(GLOB_RECURSE SOURCES "${PROJECT_SOURCE_DIR}/src/*.cpp") + +set(MAIN_SRC "${PROJECT_SOURCE_DIR}/src/main.cpp") +list(REMOVE_ITEM SOURCES ${MAIN_SRC}) + if(NOT SOURCES) message(WARNING "No source files found in ${PROJECT_SOURCE_DIR}/src — the build will create an empty library") endif() diff --git a/simulator_SIC_XE/gui/qt/mainwindow.cpp b/simulator_SIC_XE/gui/qt/mainwindow.cpp index b53aa72..b3d7525 100644 --- a/simulator_SIC_XE/gui/qt/mainwindow.cpp +++ b/simulator_SIC_XE/gui/qt/mainwindow.cpp @@ -5,7 +5,7 @@ #include "../../include/instructions.h" #include "../../include/opcode.h" #include "../../include/constants.h" -#include "../../../include/loader.h" +#include "../../include/loader.h" #include #include diff --git a/simulator_SIC_XE/include/lexer.h b/simulator_SIC_XE/include/lexer.h new file mode 100644 index 0000000..f9bdd70 --- /dev/null +++ b/simulator_SIC_XE/include/lexer.h @@ -0,0 +1,55 @@ +#ifndef LEXER_H +#define LEXER_H + +#include +#include +#include + +class SyntaxError : public std::runtime_error { +public: + int row; + int col; + + SyntaxError(const std::string& msg, int row_, int col_) + : std::runtime_error(msg), row(row_), col(col_) {} +}; + + +class Lexer { +public: + int row; + int col; + + explicit Lexer(std::string input); + + Lexer& mark(); + + std::string extract(int ofs); + std::string extract(); + + char peek(int ahead) const; + char peek() const; + + char advance(); + + bool advanceIf(char ch); + void advance(char ch); + + + bool skipWhitespace(); + + std::string readTo(char delimiter); + + std::string readAlphanumeric(); + + std::string readDigits(int radix); + +private: + std::string input_; + std::size_t pos_; + std::size_t start_; + + static int digitValue(char c, int radix); +}; + +#endif // LEXER_H diff --git a/simulator_SIC_XE/include/mnemonic.h b/simulator_SIC_XE/include/mnemonic.h index b144dfb..3cdddab 100644 --- a/simulator_SIC_XE/include/mnemonic.h +++ b/simulator_SIC_XE/include/mnemonic.h @@ -1,16 +1,45 @@ +// mnemonic.h #ifndef MNEMONIC_H #define MNEMONIC_H +#include #include +#include +#include -using std::string; +#include "opcode.h" + +struct Empty {}; +struct Register { int num; }; +struct Immediate { int value; }; +struct SymbolRef { + std::string name; + bool indexed = false; + bool immediate = false; + bool indirect = false; +}; + +using Operand = std::variant; class Mnemonic { public: - string toString() const; + Mnemonic(std::uint8_t opcode, InstructionType type, bool extended) + : _opcode(opcode), _extended(extended), _type(type) {} + + std::uint8_t opcode() const { return _opcode; } + bool extended() const { return _extended; } + InstructionType type() const { return _type; } + + std::vector& operands() { return _operands; } + const std::vector& operands() const { return _operands; } + + std::string toString() const; + +private: + std::uint8_t _opcode; + bool _extended; + InstructionType _type; + std::vector _operands; }; - - - -#endif // MNEMONIC_H \ No newline at end of file +#endif // MNEMONIC_H diff --git a/simulator_SIC_XE/include/node.h b/simulator_SIC_XE/include/node.h index 2152f97..2ad0d86 100644 --- a/simulator_SIC_XE/include/node.h +++ b/simulator_SIC_XE/include/node.h @@ -3,23 +3,22 @@ #include #include - +#include +#include +#include #include "mnemonic.h" using std::string; class Node { public: + virtual ~Node() = default; - string getLabel() const; - - string getComment() const; - - std::shared_ptr getMnemonic() const; - - string toString() const; - + string getLabel() const { return _label; } + string getComment() const { return _comment; } + std::shared_ptr getMnemonic() const { return _mnemonic; } + virtual string toString() const; protected: string _label; @@ -27,5 +26,73 @@ protected: string _comment; }; +class InstructionNode : public Node { +public: + InstructionNode(string label, + std::shared_ptr mnemonic, + string comment) { + _label = std::move(label); + _mnemonic = std::move(mnemonic); + _comment = std::move(comment); + } -#endif // NODE_H \ No newline at end of file + string toString() const override; +}; + +class CommentNode : public Node { +public: + explicit CommentNode(string text) { + _comment = std::move(text); + } + + string toString() const override; +}; + +enum class DirectiveKind { + START, END, BASE, NOBASE, EQU, ORG, LTORG, + EXTDEF, EXTREF, CSECT +}; + +using DirectiveArg = std::variant>; + +class DirectiveNode : public Node { +public: + DirectiveNode(string label, DirectiveKind kind, DirectiveArg arg, string comment) + : _kind(kind), _arg(std::move(arg)) { + _label = std::move(label); + _comment = std::move(comment); + } + + DirectiveKind kind() const { return _kind; } + const DirectiveArg& arg() const { return _arg; } + + string toString() const override; + +private: + DirectiveKind _kind; + DirectiveArg _arg; +}; + +enum class DataKind { WORD, BYTE, RESW, RESB }; + +using DataValue = std::variant>; + +class DataNode : public Node { +public: + DataNode(string label, DataKind kind, DataValue value, string comment) + : _kind(kind), _value(std::move(value)) { + _label = std::move(label); + _comment = std::move(comment); + } + + DataKind kind() const { return _kind; } + const DataValue& value() const { return _value; } + + string toString() const override; + +private: + DataKind _kind; + DataValue _value; +}; + +#endif // NODE_H diff --git a/simulator_SIC_XE/include/opcode.h b/simulator_SIC_XE/include/opcode.h index a467488..1034473 100644 --- a/simulator_SIC_XE/include/opcode.h +++ b/simulator_SIC_XE/include/opcode.h @@ -3,6 +3,10 @@ #include "utils.h" +#include +#include +#include + // ============================== // Opcode definitions (SIC/XE) // ============================== @@ -87,6 +91,8 @@ #define LDVS 0x68 #define LDVT 0x04 +static std::unordered_map mnemonicToOpcode; +static bool opcodeTablesInitialized = false; enum class InstructionType { @@ -110,6 +116,10 @@ struct InstructionInfo { extern InstructionInfo instructions[]; extern InstructionInfo instructionsEXEX[]; +extern std::optional findOpcodeByMnemonic(std::string_view name); +extern const InstructionInfo& getInstructionInfo(uint8_t opcode); + + // Initialize the instruction table void loadInstructionSet(); diff --git a/simulator_SIC_XE/include/parser.h b/simulator_SIC_XE/include/parser.h new file mode 100644 index 0000000..8ae126a --- /dev/null +++ b/simulator_SIC_XE/include/parser.h @@ -0,0 +1,52 @@ +// parser.h +#ifndef PARSER_H +#define PARSER_H + +#include +#include +#include +#include +#include + +#include "lexer.h" +#include "code.h" +#include "opcode.h" +#include "mnemonic.h" + +class Parser { +public: + Parser() = default; + + Code parse(const std::string& input); + +private: + std::string parseLabel(); + std::shared_ptr parseMnemonic(); + std::string parseSymbol(); + int parseRegister(); + void parseComma(); + bool parseIndexed(); + int parseNumber(int lo, int hi); + std::vector parseData(); + + void parseOperands(Mnemonic& m); + + bool isDirective(const std::string& name); + bool isDataDirective(const std::string& name); + std::shared_ptr parseDirective(const std::string& label, const std::string& directive); + std::shared_ptr parseDataDirective(const std::string& label, const std::string& directive); + + std::shared_ptr parseInstruction(); + Code parseCode(); + + std::shared_ptr makeMnemonic(const std::string& name, bool extended); + static void initMnemonicMap(); + +private: + Lexer lexer_{""}; + + static inline std::unordered_map s_nameToOpcode{}; + static inline bool s_mnemonicMapInitialized = false; +}; + +#endif // PARSER_H diff --git a/simulator_SIC_XE/src/lexer.cpp b/simulator_SIC_XE/src/lexer.cpp new file mode 100644 index 0000000..7ac9344 --- /dev/null +++ b/simulator_SIC_XE/src/lexer.cpp @@ -0,0 +1,138 @@ +#include "lexer.h" +#include +#include + +Lexer::Lexer(std::string input) + : input_(std::move(input)), + pos_(0), + start_(0), + row(1), + col(1) +{ +} + +Lexer& Lexer::mark() { + start_ = pos_; + return *this; +} + +std::string Lexer::extract(int ofs) { + std::size_t end = pos_ + static_cast(ofs); + if (end > input_.size()) { + end = input_.size(); + } + if (end < start_) { + end = start_; + } + return input_.substr(start_, end - start_); +} + +std::string Lexer::extract() { + return extract(0); +} + +char Lexer::peek(int ahead) const { + std::size_t idx = pos_ + static_cast(ahead); + if (idx < input_.size()) { + return input_[idx]; + } + return '\0'; // sentinel for "no more chars" +} + +char Lexer::peek() const { + return peek(0); +} + +char Lexer::advance() { + char ch = peek(); + if (ch == '\0') { + return '\0'; // don't move past end + } + + ++pos_; + + // update logical location + if (ch == '\n') { + ++row; + col = 1; + } else if (ch == '\t') { + col = ((col - 1) / 4) * 4 + 5; + } else { + ++col; + } + return ch; +} + +bool Lexer::advanceIf(char ch) { + if (peek() != ch) { + return false; + } + advance(); + return true; +} + +void Lexer::advance(char ch) { + if (!advanceIf(ch)) { + throw SyntaxError(std::string("'") + ch + "' expected", row, col); + } +} + +bool Lexer::skipWhitespace() { + while (true) { + char p = peek(); + if (p == ' ' || p == '\t') { + advance(); + } else { + break; + } + } + char p = peek(); + return (p == '\n' || p == '\0'); +} + +std::string Lexer::readTo(char delimiter) { + mark(); + while (peek() > 0 && peek() != delimiter) { + advance(); + } + if (peek() == delimiter) { + advance(); // consume delimiter + } + // exclude delimiter itself (like Java's extract(-1)) + return extract(-1); +} + +std::string Lexer::readAlphanumeric() { + mark(); + while (true) { + char c = peek(); + if (std::isalnum(static_cast(c)) || c == '_') { + advance(); + } else { + break; + } + } + return extract(); +} + +int Lexer::digitValue(char c, int radix) { + if (radix < 2 || radix > 36) return -1; + int v = -1; + if (c >= '0' && c <= '9') { + v = c - '0'; + } else if (c >= 'A' && c <= 'Z') { + v = c - 'A' + 10; + } else if (c >= 'a' && c <= 'z') { + v = c - 'a' + 10; + } + if (v >= 0 && v < radix) return v; + return -1; +} + +std::string Lexer::readDigits(int radix) { + mark(); + while (digitValue(peek(), radix) != -1) { + advance(); + } + return extract(); +} diff --git a/simulator_SIC_XE/src/mnemonic.cpp b/simulator_SIC_XE/src/mnemonic.cpp deleted file mode 100644 index 4944082..0000000 --- a/simulator_SIC_XE/src/mnemonic.cpp +++ /dev/null @@ -1,6 +0,0 @@ -#include "mnemonic.h" - -string Mnemonic::toString() const -{ - return string(); -} \ No newline at end of file diff --git a/simulator_SIC_XE/src/node.cpp b/simulator_SIC_XE/src/node.cpp index f0fccdb..c0b52fd 100644 --- a/simulator_SIC_XE/src/node.cpp +++ b/simulator_SIC_XE/src/node.cpp @@ -1,22 +1,120 @@ #include "node.h" +#include +#include -string Node::getLabel() const -{ - return _label; +string Node::toString() const { + std::ostringstream oss; + if (!_label.empty()) oss << _label << " "; + if (_mnemonic) oss << _mnemonic->toString() << " "; + if (!_comment.empty()) oss << "." << _comment; + return oss.str(); } -string Node::getComment() const -{ - return _comment; +std::string Mnemonic::toString() const { + std::ostringstream oss; + oss << "[OP:" << std::hex << (int)_opcode << "]"; + if (_extended) oss << "+"; + // Print operands + for (size_t i = 0; i < _operands.size(); ++i) { + if (i > 0) oss << ","; + std::visit([&](auto&& arg) { + using T = std::decay_t; + if constexpr (std::is_same_v) { + // nothing + } else if constexpr (std::is_same_v) { + oss << "R" << arg.num; + } else if constexpr (std::is_same_v) { + oss << "#" << arg.value; + } else if constexpr (std::is_same_v) { + oss << arg.name; + if (arg.indexed) oss << ",X"; + } + }, _operands[i]); + } + return oss.str(); } -std::shared_ptr Node::getMnemonic() const -{ - return _mnemonic; +string InstructionNode::toString() const { + std::ostringstream oss; + if (!_label.empty()) oss << _label << " "; + if (_mnemonic) oss << _mnemonic->toString(); + if (!_comment.empty()) oss << " ." << _comment; + return oss.str(); } -string Node::toString() const -{ - return (_label.length() > 0 ? _label + " " : "") + (_mnemonic ? _mnemonic->toString() + " ": "") + "." + _comment; +string CommentNode::toString() const { + return "." + _comment; +} + +string DirectiveNode::toString() const { + std::ostringstream oss; + if (!_label.empty()) oss << _label << " "; + switch (_kind) { + case DirectiveKind::START: oss << "START"; break; + case DirectiveKind::END: oss << "END"; break; + case DirectiveKind::BASE: oss << "BASE"; break; + case DirectiveKind::NOBASE: oss << "NOBASE"; break; + case DirectiveKind::EQU: oss << "EQU"; break; + case DirectiveKind::ORG: oss << "ORG"; break; + case DirectiveKind::LTORG: oss << "LTORG"; break; + case DirectiveKind::EXTDEF: oss << "EXTDEF"; break; + case DirectiveKind::EXTREF: oss << "EXTREF"; break; + case DirectiveKind::CSECT: oss << "CSECT"; break; + } + std::visit([&](auto&& arg) { + using T = std::decay_t; + if constexpr (std::is_same_v) { + // no arg + } else if constexpr (std::is_same_v) { + oss << " " << std::hex << arg; + } else if constexpr (std::is_same_v) { + oss << " " << arg; + } else if constexpr (std::is_same_v>) { + for (size_t i = 0; i < arg.size(); ++i) { + if (i > 0) oss << ","; + oss << arg[i]; + } + } + }, _arg); + if (!_comment.empty()) oss << " ." << _comment; + return oss.str(); +} + +string DataNode::toString() const { + std::ostringstream oss; + if (!_label.empty()) oss << _label << " "; + switch (_kind) { + case DataKind::WORD: oss << "WORD"; break; + case DataKind::BYTE: oss << "BYTE"; break; + case DataKind::RESW: oss << "RESW"; break; + case DataKind::RESB: oss << "RESB"; break; + } + std::visit([&](auto&& arg) { + using T = std::decay_t; + if constexpr (std::is_same_v) { + // no value + } else if constexpr (std::is_same_v) { + oss << " " << arg; + } else if constexpr (std::is_same_v>) { + // Try to display as string if all printable ASCII + bool isPrintable = !arg.empty() && std::all_of(arg.begin(), arg.end(), + [](uint8_t b) { return b >= 32 && b <= 126; }); + + if (isPrintable) { + oss << " C'"; + for (uint8_t b : arg) oss << static_cast(b); + oss << "'"; + } else { + // Display as hex + oss << " X'"; + for (uint8_t b : arg) { + oss << std::hex << std::setw(2) << std::setfill('0') << (int)b; + } + oss << "'"; + } + } + }, _value); + if (!_comment.empty()) oss << " ." << _comment; + return oss.str(); } \ No newline at end of file diff --git a/simulator_SIC_XE/src/opcode.cpp b/simulator_SIC_XE/src/opcode.cpp index 0b63ce8..fcda0c4 100644 --- a/simulator_SIC_XE/src/opcode.cpp +++ b/simulator_SIC_XE/src/opcode.cpp @@ -95,8 +95,36 @@ void loadInstructionSet() if (instructions[i].name == nullptr) instructions[i] = {"INVALID", InstructionType::INVALID, nullptr}; if (instructionsEXEX[i].name == nullptr) instructionsEXEX[i] = {"INVALID", InstructionType::INVALID, nullptr}; } + + // Initialize mnemonicToOpcode map + for (int i = 0; i < 0xff; ++i) { + if (instructions[i].type != InstructionType::INVALID) { + mnemonicToOpcode.emplace(instructions[i].name, static_cast(i)); + } + if (instructionsEXEX[i].type != InstructionType::INVALID) { + mnemonicToOpcode.emplace(instructionsEXEX[i].name, static_cast(i)); + } + } + opcodeTablesInitialized = true; } +std::optional findOpcodeByMnemonic(std::string_view name) +{ + auto it = mnemonicToOpcode.find(name); + if (it == mnemonicToOpcode.end()) + return std::nullopt; + return it->second; +} + +const InstructionInfo& getInstructionInfo(uint8_t opcode) +{ + if (instructions[opcode].type != InstructionType::INVALID) + return instructions[opcode]; + return instructionsEXEX[opcode]; +} + + + AddressingMode getAddressingMode(int ni) { switch (ni) { diff --git a/simulator_SIC_XE/src/parser.cpp b/simulator_SIC_XE/src/parser.cpp new file mode 100644 index 0000000..c5bf939 --- /dev/null +++ b/simulator_SIC_XE/src/parser.cpp @@ -0,0 +1,449 @@ +// parser.cpp +#include "parser.h" +#include +#include +#include + +void Parser::initMnemonicMap() { + if (s_mnemonicMapInitialized) return; + + loadInstructionSet(); + + for (int op = 0; op < 0xFF; ++op) { + const auto& info = instructions[op]; + if (info.name && info.type != InstructionType::INVALID) { + s_nameToOpcode.emplace(info.name, static_cast(op)); + } + const auto& ex = instructionsEXEX[op]; + if (ex.name && ex.type != InstructionType::INVALID) { + s_nameToOpcode.emplace(ex.name, static_cast(op)); + } + } + + s_mnemonicMapInitialized = true; +} + +std::shared_ptr Parser::makeMnemonic(const std::string& name, bool extended) { + initMnemonicMap(); + + auto it = s_nameToOpcode.find(name); + if (it == s_nameToOpcode.end()) { + throw SyntaxError("Invalid mnemonic '" + name + "'", lexer_.row, lexer_.col); + } + + std::uint8_t opcode = it->second; + const InstructionInfo* info = nullptr; + + if (instructions[opcode].type != InstructionType::INVALID) { + info = &instructions[opcode]; + } else if (instructionsEXEX[opcode].type != InstructionType::INVALID) { + info = &instructionsEXEX[opcode]; + } + + if (!info) { + throw SyntaxError("Invalid mnemonic '" + name + "'", lexer_.row, lexer_.col); + } + + if (extended && info->type != InstructionType::TYPE3_4) { + throw SyntaxError( + "Extended format not allowed for mnemonic '" + name + "'", + lexer_.row, + lexer_.col + ); + } + + return std::make_shared(opcode, info->type, extended); +} + +std::string Parser::parseLabel() { + if (lexer_.col == 1 && std::isalpha(static_cast(lexer_.peek()))) { + return std::string(lexer_.readAlphanumeric()); + } + return {}; +} + +std::shared_ptr Parser::parseMnemonic() { + bool isExtended = lexer_.advanceIf('+'); + std::string name(lexer_.readAlphanumeric()); + if (name.empty()) { + throw SyntaxError("Mnemonic expected", lexer_.row, lexer_.col); + } + return makeMnemonic(name, isExtended); +} + +std::string Parser::parseSymbol() { + return std::string(lexer_.readAlphanumeric()); +} + +int Parser::parseRegister() { + char ch = lexer_.advance(); + constexpr std::string_view regs = "AXLBSTF"; + auto pos = regs.find(ch); + if (pos == std::string_view::npos) { + throw SyntaxError(std::string("Invalid register '") + ch + "'", lexer_.row, lexer_.col); + } + return static_cast(pos); +} + +void Parser::parseComma() { + lexer_.skipWhitespace(); + lexer_.advance(','); + lexer_.skipWhitespace(); +} + +bool Parser::parseIndexed() { + lexer_.skipWhitespace(); + if (lexer_.advanceIf(',')) { + lexer_.skipWhitespace(); + lexer_.advance('X'); + return true; + } + return false; +} + +static int digitValue(char c, int radix) { + if (radix < 2 || radix > 36) return -1; + int v = -1; + if (c >= '0' && c <= '9') v = c - '0'; + else if (c >= 'A' && c <= 'Z') v = c - 'A' + 10; + else if (c >= 'a' && c <= 'z') v = c - 'a' + 10; + if (v >= 0 && v < radix) return v; + return -1; +} + +int Parser::parseNumber(int lo, int hi) { + auto parseDigits = [&](int radix) -> int { + std::string digits(lexer_.readDigits(radix)); + if (digits.empty()) { + throw SyntaxError("Invalid number", lexer_.row, lexer_.col); + } + + long long value = 0; + for (char c : digits) { + int d = digitValue(c, radix); + if (d < 0) throw SyntaxError("Invalid number", lexer_.row, lexer_.col); + value = value * radix + d; + if (value > std::numeric_limits::max()) { + throw SyntaxError("Invalid number", lexer_.row, lexer_.col); + } + } + return static_cast(value); + }; + + int num = 0; + + if (lexer_.peek() == '0') { + int radix = -1; + switch (lexer_.peek(1)) { + case 'b': radix = 2; break; + case 'o': radix = 8; break; + case 'x': radix = 16; break; + default: break; + } + if (radix != -1) { + lexer_.advance(); + lexer_.advance(); + num = parseDigits(radix); + } else { + num = parseDigits(10); + } + } else if (std::isdigit(static_cast(lexer_.peek()))) { + num = parseDigits(10); + } else { + throw SyntaxError("Number expected", lexer_.row, lexer_.col); + } + + if (std::isalnum(static_cast(lexer_.peek()))) { + throw SyntaxError( + std::string("invalid digit '") + lexer_.peek() + "'", + lexer_.row, + lexer_.col + ); + } + + if (num < lo || num > hi) { + throw SyntaxError( + "Number '" + std::to_string(num) + "' out of range [" + + std::to_string(lo) + ".." + std::to_string(hi) + "]", + lexer_.row, + lexer_.col + ); + } + + return num; +} + +std::vector Parser::parseData() { + if (lexer_.advanceIf('C')) { + lexer_.advance('\''); + std::string s(lexer_.readTo('\'')); + std::vector data; + data.reserve(s.size()); + for (unsigned char c : s) { + data.push_back(static_cast(c)); + } + return data; + } + + if (lexer_.advanceIf('X')) { + lexer_.advance('\''); + std::string s(lexer_.readTo('\'')); + if (s.size() % 2 != 0) { + throw SyntaxError("Invalid hex literal length", lexer_.row, lexer_.col); + } + + std::vector data; + data.reserve(s.size() / 2); + + auto hexVal = [](char c) -> int { + if (c >= '0' && c <= '9') return c - '0'; + if (c >= 'A' && c <= 'F') return c - 'A' + 10; + if (c >= 'a' && c <= 'f') return c - 'a' + 10; + return -1; + }; + + for (std::size_t i = 0; i < s.size(); i += 2) { + int hi = hexVal(s[i]); + int lo = hexVal(s[i + 1]); + if (hi < 0 || lo < 0) { + throw SyntaxError("Invalid hex digit in literal", lexer_.row, lexer_.col); + } + data.push_back(static_cast((hi << 4) | lo)); + } + return data; + } + + if (std::isdigit(static_cast(lexer_.peek()))) { + constexpr int MAX_WORD = 0xFFFFFF; + int num = parseNumber(0, MAX_WORD); + return { + static_cast((num >> 16) & 0xFF), + static_cast((num >> 8) & 0xFF), + static_cast(num & 0xFF) + }; + } + + throw SyntaxError( + std::string("Invalid storage specifier '") + lexer_.peek() + "'", + lexer_.row, + lexer_.col + ); +} + +void Parser::parseOperands(Mnemonic& m) { + InstructionType t = m.type(); + char c = lexer_.peek(); + + if (t == InstructionType::TYPE1) { + // TYPE1 has no operands + return; + } + + if (t == InstructionType::TYPE2) { + // TYPE2: r1 or r1,r2 or r1,n + if (c == '\n' || c == '\0') return; + + int r1 = parseRegister(); + m.operands().emplace_back(Register{r1}); + lexer_.skipWhitespace(); + + if (lexer_.peek() == ',') { + parseComma(); + char c2 = lexer_.peek(); + if (std::isalpha(static_cast(c2))) { + int r2 = parseRegister(); + m.operands().emplace_back(Register{r2}); + } else if (std::isdigit(static_cast(c2))) { + int n = parseNumber(0, 0xFFFF); + m.operands().emplace_back(Immediate{n}); + } else { + throw SyntaxError("Invalid second operand", lexer_.row, lexer_.col); + } + } + + return; + } + + if (t == InstructionType::TYPE3_4) { + lexer_.skipWhitespace(); + char c0 = lexer_.peek(); + if (c0 == '\n' || c0 == '\0') { + // No operand (e.g., RSUB) + return; + } + + bool immediate = false; + bool indirect = false; + + if (lexer_.advanceIf('#')) { + immediate = true; + } else if (lexer_.advanceIf('@')) { + indirect = true; + } + + char c1 = lexer_.peek(); + if (std::isdigit(static_cast(c1))) { + int num = parseNumber(0, 0x7FFFFF); + if (immediate) { + m.operands().emplace_back(Immediate{num}); + } else { + // Direct numeric addressing (rare, treat as immediate) + m.operands().emplace_back(Immediate{num}); + } + } else if (std::isalpha(static_cast(c1))) { + std::string symbol = parseSymbol(); + bool indexed = parseIndexed(); + m.operands().emplace_back(SymbolRef{symbol, indexed, immediate, indirect}); + } else { + throw SyntaxError("Invalid operand", lexer_.row, lexer_.col); + } + + return; + } +} + +bool Parser::isDirective(const std::string& name) { + return name == "START" || name == "END" || name == "BASE" || name == "NOBASE" || + name == "EQU" || name == "ORG" || name == "LTORG" || + name == "EXTDEF" || name == "EXTREF" || name == "CSECT"; +} + +bool Parser::isDataDirective(const std::string& name) { + return name == "WORD" || name == "BYTE" || name == "RESW" || name == "RESB"; +} + +std::shared_ptr Parser::parseDirective(const std::string& label, const std::string& directive) { + lexer_.skipWhitespace(); + + DirectiveArg argValue; + char c = lexer_.peek(); + + // Parse argument based on first character + if (std::isalpha(c)) { + std::string arg = std::string(lexer_.readAlphanumeric()); + argValue = arg; + } else if (std::isdigit(c) || c == '0') { + int num = parseNumber(0, 0xFFFFFF); + argValue = num; + } else { + // No argument + argValue = std::monostate{}; + } + + lexer_.skipWhitespace(); + std::string comment = std::string(lexer_.readTo('\n')); + + DirectiveKind kind; + if (directive == "START") kind = DirectiveKind::START; + else if (directive == "END") kind = DirectiveKind::END; + else if (directive == "BASE") kind = DirectiveKind::BASE; + else if (directive == "NOBASE") kind = DirectiveKind::NOBASE; + else if (directive == "EQU") kind = DirectiveKind::EQU; + else if (directive == "ORG") kind = DirectiveKind::ORG; + else if (directive == "LTORG") kind = DirectiveKind::LTORG; + else if (directive == "EXTDEF") kind = DirectiveKind::EXTDEF; + else if (directive == "EXTREF") kind = DirectiveKind::EXTREF; + else if (directive == "CSECT") kind = DirectiveKind::CSECT; + else throw SyntaxError("Unknown directive", lexer_.row, lexer_.col); + + return std::make_shared(label, kind, argValue, comment); +} + +std::shared_ptr Parser::parseDataDirective(const std::string& label, const std::string& directive) { + lexer_.skipWhitespace(); + + DataKind kind; + if (directive == "WORD") kind = DataKind::WORD; + else if (directive == "BYTE") kind = DataKind::BYTE; + else if (directive == "RESW") kind = DataKind::RESW; + else if (directive == "RESB") kind = DataKind::RESB; + else throw SyntaxError("Unknown data directive", lexer_.row, lexer_.col); + + DataValue value; + if (kind == DataKind::WORD || kind == DataKind::RESW || kind == DataKind::RESB) { + int num = parseNumber(0, 0xFFFFFF); + value = num; + } else { // BYTE + auto bytes = parseData(); + value = bytes; + } + + lexer_.skipWhitespace(); + std::string comment = std::string(lexer_.readTo('\n')); + + return std::make_shared(label, kind, value, comment); +} + +std::shared_ptr Parser::parseInstruction() { + if (lexer_.col == 1 && lexer_.peek() == '.') { + return std::make_shared( + std::string(lexer_.readTo('\n')) + ); + } + + std::string label = parseLabel(); + + if (lexer_.skipWhitespace() && label.empty()) { + lexer_.advance(); + return nullptr; + } + + lexer_.skipWhitespace(); + + // Check for extended format prefix + bool isExtended = lexer_.peek() == '+'; + if (isExtended) { + lexer_.advance(); + } + + std::string name = std::string(lexer_.readAlphanumeric()); + + if (name.empty()) { + throw SyntaxError("Mnemonic or directive expected", lexer_.row, lexer_.col); + } + + // Check if it's a directive or data directive + if (isDirective(name)) { + return parseDirective(label, name); + } + + if (isDataDirective(name)) { + return parseDataDirective(label, name); + } + + // It's an instruction - create mnemonic + auto mnemonic = makeMnemonic(name, isExtended); + lexer_.skipWhitespace(); + + parseOperands(*mnemonic); + lexer_.skipWhitespace(); + + std::string comment(lexer_.readTo('\n')); + + return std::make_shared( + std::move(label), + std::move(mnemonic), + std::move(comment) + ); +} + +Code Parser::parseCode() { + Code code; + + while (lexer_.peek() > 0) { + while (lexer_.peek() > 0 && lexer_.col > 1) { + lexer_.readTo('\n'); + } + + if (auto node = parseInstruction()) { + code.addLine(node); + } + } + + return code; +} + +Code Parser::parse(const std::string& input) { + lexer_ = Lexer(input); + return parseCode(); +}