1-编译原理

1702 字
9 分钟
1-编译原理

实现DFA#

实现如下两个函数

bool frontend::DFA::next(char input, Token& buf) {
#ifdef DEBUG_DFA
#include<iostream>
std::cout << "in state [" << toString(cur_state) << "], input = \'" << input << "\', str = " << cur_str << "\t";
#endif
TODO;
#ifdef DEBUG_DFA
std::cout << "next state is [" << toString(cur_state) << "], next str = " << cur_str << "\t, ret = " << ret << std::endl;
#endif
}
void frontend::DFA::reset() {
cur_state = State::Empty;
cur_str = "";
}

5个状态并不够用,增加状态

enum class State
{
Empty, // space, \n, \r ...
Ident, // a keyword or identifier, like 'int' 'a0' 'else' ...
IntLiteral, // int literal, like '1' '1900', only in decimal
FloatLiteral, // float literal, like '0.1'
op_ext, // 操作符<,>,=,!
op_equ, // 操作符<=.>=,==.!=
op_and, // 操作符&
op_andand, // 操作符&&
op_or, // 操作符|
op_oror, // 操作符||
op // 其他操作符
};

设计具体的状态转移图

状态转移图
状态转移图

产出token后不一定消费了字符,故要设一个consumed变量

代码

#include"front/lexical.h"
#include<map>
#include<cassert>
#include<cctype>
#include<stdexcept>
#include<string>
#include<unordered_set>
#define TODO assert(0 && "todo")
// #define DEBUG_DFA
// #define DEBUG_SCANNER
namespace utils {
bool isExtop(char& c) {
std::unordered_set<char> ops = {
'<', '>', '=', '!'
};
return ops.count(c);
}
bool isOp(char& c) {
std::unordered_set<char> ops = {
'+', '-', '*', '/',
'%', ':', ';', ',',
'(', ')', '[', ']',
'{', '}'
};
return ops.count(c);
}
bool isAnd(char& c) {
return c == '&';
}
bool isOr(char& c) {
return c == '|';
}
bool isIdentStart(char c) {
return std::isalpha(static_cast<unsigned char>(c)) || c == '_';
}
bool isIdentChar(char c) {
return std::isalnum(static_cast<unsigned char>(c)) || c == '_';
}
bool isHexDigit(char c) {
return std::isxdigit(static_cast<unsigned char>(c));
}
bool isOctDigit(char c) {
return c >= '0' && c <= '7';
}
frontend::TokenType keywordOrIdentType(const std::string& str) {
if (str == "const") return frontend::TokenType::CONSTTK;
if (str == "void") return frontend::TokenType::VOIDTK;
if (str == "int") return frontend::TokenType::INTTK;
if (str == "float") return frontend::TokenType::FLOATTK;
if (str == "if") return frontend::TokenType::IFTK;
if (str == "else") return frontend::TokenType::ELSETK;
if (str == "while") return frontend::TokenType::WHILETK;
if (str == "continue") return frontend::TokenType::CONTINUETK;
if (str == "break") return frontend::TokenType::BREAKTK;
if (str == "return") return frontend::TokenType::RETURNTK;
return frontend::TokenType::IDENFR;
}
frontend::TokenType operatorType(const std::string& str) {
if (str == "+") return frontend::TokenType::PLUS;
if (str == "-") return frontend::TokenType::MINU;
if (str == "*") return frontend::TokenType::MULT;
if (str == "/") return frontend::TokenType::DIV;
if (str == "%") return frontend::TokenType::MOD;
if (str == "<") return frontend::TokenType::LSS;
if (str == ">") return frontend::TokenType::GTR;
if (str == ":") return frontend::TokenType::COLON;
if (str == "=") return frontend::TokenType::ASSIGN;
if (str == ";") return frontend::TokenType::SEMICN;
if (str == ",") return frontend::TokenType::COMMA;
if (str == "(") return frontend::TokenType::LPARENT;
if (str == ")") return frontend::TokenType::RPARENT;
if (str == "[") return frontend::TokenType::LBRACK;
if (str == "]") return frontend::TokenType::RBRACK;
if (str == "{") return frontend::TokenType::LBRACE;
if (str == "}") return frontend::TokenType::RBRACE;
if (str == "!") return frontend::TokenType::NOT;
if (str == "<=") return frontend::TokenType::LEQ;
if (str == ">=") return frontend::TokenType::GEQ;
if (str == "==") return frontend::TokenType::EQL;
if (str == "!=") return frontend::TokenType::NEQ;
if (str == "&&") return frontend::TokenType::AND;
if (str == "||") return frontend::TokenType::OR;
assert(0 && "invalid operator");
return frontend::TokenType::PLUS;
}
std::string preprocess(std::ifstream& fin) {
enum class CommentState {
Normal,
Slash,
LineComment,
BlockComment,
BlockCommentStar,
};
std::string ret;
CommentState state = CommentState::Normal;
char c;
while (fin.get(c)) {
switch (state) {
case CommentState::Normal:
if (c == '/') {
state = CommentState::Slash;
} else {
ret += c;
}
break;
case CommentState::Slash:
if (c == '/') {
state = CommentState::LineComment;
} else if (c == '*') {
state = CommentState::BlockComment;
} else {
ret += '/';
ret += c;
state = CommentState::Normal;
}
break;
case CommentState::LineComment:
if (c == '\n') {
ret += '\n';
state = CommentState::Normal;
}
break;
case CommentState::BlockComment:
if (c == '*') {
state = CommentState::BlockCommentStar;
} else if (c == '\n') {
ret += '\n';
}
break;
case CommentState::BlockCommentStar:
if (c == '/') {
state = CommentState::Normal;
} else if (c == '*') {
state = CommentState::BlockCommentStar;
} else {
if (c == '\n') {
ret += '\n';
}
state = CommentState::BlockComment;
}
break;
default:
assert(0 && "invalid preprocess state");
}
}
if (state == CommentState::Slash) {
ret += '/';
} else if (state == CommentState::BlockComment || state == CommentState::BlockCommentStar) {
throw std::runtime_error("lexical error: unterminated block comment");
}
ret += ' ';
return ret;
}
} // namespace utils
std::string frontend::toString(State s) {
switch (s) {
case State::Empty: return "Empty";
case State::Ident: return "Ident";
case State::IntLiteral: return "IntLiteral";
case State::FloatLiteral: return "FloatLiteral";
case State::op_ext: return "op_ext";
case State::op_equ: return "op_equ";
case State::op_and: return "op_and";
case State::op_andand: return "op_andand";
case State::op_or: return "op_or";
case State::op_oror: return "op_oror";
case State::op: return "op";
default: assert(0 && "invalid State");
}
return "";
}
std::set<std::string> frontend::keywords= {
"const", "int", "float", "if", "else", "while", "continue", "break", "return", "void"
};
frontend::DFA::DFA(): cur_state(frontend::State::Empty), cur_str() {}
frontend::DFA::~DFA() {}
bool frontend::DFA::next(char input, Token& buf, bool& consumed) {
bool ret = false;
#ifdef DEBUG_DFA
#include<iostream>
std::cout << "in state [" << toString(cur_state) << "], input = \'" << input << "\', str = " << cur_str << "\t";
#endif
switch (cur_state) {
case State::Empty :
if (std::isspace(static_cast<unsigned char>(input))) break;
cur_str += input;
if (std::isdigit(static_cast<unsigned char>(input))) {
cur_state = State::IntLiteral;
} else if (input == '.') {
cur_state = State::FloatLiteral;
} else if (utils::isIdentStart(input)) {
cur_state = State::Ident;
} else if (utils::isExtop(input)) {
cur_state = State::op_ext;
} else if (utils::isAnd(input)) {
cur_state = State::op_and;
} else if (utils::isOr(input)) {
cur_state = State::op_or;
} else if (utils::isOp(input)) {
cur_state = State::op;
} else {
throw std::runtime_error(std::string("lexical error: invalid character '") + input + "'");
}
break;
case State::IntLiteral :
if (cur_str == "0") {
if (input == 'x' || input == 'X') {
cur_str += input;
break;
}
if (input == '.') {
cur_str += input;
cur_state = State::FloatLiteral;
break;
}
if (utils::isOctDigit(input)) {
cur_str += input;
break;
}
if (std::isdigit(static_cast<unsigned char>(input))) {
throw std::runtime_error("lexical error: invalid octal literal");
}
consumed = 0;
buf = {TokenType::INTLTR, cur_str};
reset();
ret = true;
break;
}
if (cur_str.size() >= 2 && cur_str[0] == '0' && (cur_str[1] == 'x' || cur_str[1] == 'X')) {
if (utils::isHexDigit(input)) {
cur_str += input;
break;
}
if (cur_str.size() == 2) {
throw std::runtime_error("lexical error: invalid hex literal");
}
consumed = 0;
buf = {TokenType::INTLTR, cur_str};
reset();
ret = true;
break;
}
if (cur_str.size() >= 2 && cur_str[0] == '0') {
if (input == '.') {
cur_str += input;
cur_state = State::FloatLiteral;
break;
}
if (utils::isOctDigit(input)) {
cur_str += input;
break;
}
if (std::isdigit(static_cast<unsigned char>(input))) {
throw std::runtime_error("lexical error: invalid octal literal");
}
consumed = 0;
buf = {TokenType::INTLTR, cur_str};
reset();
ret = true;
break;
}
if (std::isdigit(static_cast<unsigned char>(input))) {
cur_str += input;
break;
}
if (input == '.') {
cur_str += input;
cur_state = State::FloatLiteral;
break;
}
consumed = 0;
buf = {TokenType::INTLTR, cur_str};
reset();
ret = true;
break;
case State::FloatLiteral:
if (std::isdigit(static_cast<unsigned char>(input))) {
cur_str += input;
break;
}
if (cur_str == ".") {
throw std::runtime_error("lexical error: invalid float literal");
}
consumed = 0;
buf = {TokenType::FLOATLTR, cur_str};
reset();
ret = true;
break;
case State::Ident:
if (utils::isIdentChar(input)) {
cur_str += input;
break;
}
consumed = 0;
buf = {utils::keywordOrIdentType(cur_str), cur_str};
reset();
ret = true;
break;
case State::op_ext:
if (input == '=') {
cur_str += input;
cur_state = State::op_equ;
break;
}
consumed = 0;
buf = {utils::operatorType(cur_str), cur_str};
reset();
ret = true;
break;
case State::op_equ:
consumed = 0;
buf = {utils::operatorType(cur_str), cur_str};
reset();
ret = true;
break;
case State::op_and:
if (input == '&') {
cur_str += input;
cur_state = State::op_andand;
break;
}
throw std::runtime_error("lexical error: invalid operator '&'");
break;
case State::op_andand:
consumed = 0;
buf = {utils::operatorType(cur_str), cur_str};
reset();
ret = true;
break;
case State::op_or:
if (input == '|') {
cur_str += input;
cur_state = State::op_oror;
break;
}
throw std::runtime_error("lexical error: invalid operator '|'");
break;
case State::op_oror:
consumed = 0;
buf = {utils::operatorType(cur_str), cur_str};
reset();
ret = true;
break;
case State::op:
consumed = 0;
buf = {utils::operatorType(cur_str), cur_str};
reset();
ret = true;
break;
default:
assert(0 && "invalid State");
}
#ifdef DEBUG_DFA
std::cout << "next state is [" << toString(cur_state) << "], next str = " << cur_str << "\t, ret = " << ret << std::endl;
#endif
return ret;
}
void frontend::DFA::reset() {
cur_state = State::Empty;
cur_str = "";
}
frontend::Scanner::Scanner(std::string filename): fin(filename) {
if(!fin.is_open()) {
throw std::runtime_error("lexical error: input file cannot open");
}
}
frontend::Scanner::~Scanner() {
fin.close();
}
std::vector<frontend::Token> frontend::Scanner::run() {
std::vector<frontend::Token> ret;
frontend::DFA dfa;
frontend::Token tk;
std::string s = utils::preprocess(fin);
size_t i = 0;
while (i < s.size()) {
bool consumed = 1;
if (dfa.next(s[i], tk, consumed)) {
#ifdef DEBUG_SCANNER
#include<iostream>
std::cout << "token: " << toString(tk.type) << "\t" << tk.value << std::endl;
#endif
ret.push_back(tk);
}
if (consumed) ++i;
}
return ret;
}

实现扫描器#

文件预处理,五个状态的状态机

预处理状态机
预处理状态机

std::string preprocess(std::ifstream& fin) {
enum class CommentState {
Normal,
Slash,
LineComment,
BlockComment,
BlockCommentStar,
};
std::string ret;
CommentState state = CommentState::Normal;
char c;
while (fin.get(c)) {
switch (state) {
case CommentState::Normal:
if (c == '/') {
state = CommentState::Slash;
} else {
ret += c;
}
break;
case CommentState::Slash:
if (c == '/') {
state = CommentState::LineComment;
} else if (c == '*') {
state = CommentState::BlockComment;
} else {
ret += '/';
ret += c;
state = CommentState::Normal;
}
break;
case CommentState::LineComment:
if (c == '\n') {
ret += '\n';
state = CommentState::Normal;
}
break;
case CommentState::BlockComment:
if (c == '*') {
state = CommentState::BlockCommentStar;
} else if (c == '\n') {
ret += '\n';
}
break;
case CommentState::BlockCommentStar:
if (c == '/') {
state = CommentState::Normal;
} else if (c == '*') {
state = CommentState::BlockCommentStar;
} else {
if (c == '\n') {
ret += '\n';
}
state = CommentState::BlockComment;
}
break;
default:
assert(0 && "invalid preprocess state");
}
}
if (state == CommentState::Slash) {
ret += '/';
} else if (state == CommentState::BlockComment || state == CommentState::BlockCommentStar) {
assert(0 && "unterminated block comment");
}
ret += ' ';
return ret;
}

调试指令

Terminal window
cd /home/skaco2/Compiler/test
python3 build.py
cd /home/skaco2/Compiler
./bin/compiler ./test/SysY.cpp -s0 -o ./test/SysY.tk

文章分享

如果这篇文章对你有帮助,欢迎分享给更多人!

1-编译原理
https://skaco2.com/posts/05-class/1-编译原理/
作者
SKACO2
发布于
2026-04-12
许可协议
CC BY-NC-SA 4.0

评论区

Profile Image of the Author
SKACO2
Hello……
公告
欢迎来到我的博客!
音乐
封面

音乐

暂未播放

0:00 0:00
暂无歌词
分类
标签
站点统计
文章
53
分类
8
标签
54
总字数
58,255
运行时长
0
最后活动
0 天前

目录