fix(lexer): 修复了词法分析器中的一些问题

This commit is contained in:
Wankupi
2023-11-05 12:14:29 +08:00
parent 2e29af68b3
commit 4be04cec2d
3 changed files with 218 additions and 264 deletions

View File

@ -515,14 +515,11 @@ bool Python3Lexer::sempred(RuleContext *context, size_t ruleIndex, size_t predic
void Python3Lexer::NEWLINEAction(antlr4::RuleContext *context, size_t actionIndex) { void Python3Lexer::NEWLINEAction(antlr4::RuleContext *context, size_t actionIndex) {
switch (actionIndex) { switch (actionIndex) {
case 0: case 0:
{ { // Braces are required inside the switch
std::string pattern1="[^\r\n\f]+"; std::regex re1(R"([^\r\n\f]+)");
std::string pattern2="[\r\n\f]+"; std::regex re2(R"([\r\n\f]+)");
std::regex re1(pattern1); std::string newLine = regex_replace(getText(), re1, "");
std::regex re2(pattern2); std::string spaces = regex_replace(getText(), re2, "");
std::string fmt="";
std::string newLine=regex_replace(getText(),re1,fmt);
std::string spaces = regex_replace(getText(),re2,fmt);
int next = _input->LA(1); int next = _input->LA(1);
if (opened > 0 || next == '\r' || next == '\n' || next == '\f' || next == '#') { if (opened > 0 || next == '\r' || next == '\n' || next == '\f' || next == '#') {
// If we're inside a list or on a blank line, ignore all indents, // If we're inside a list or on a blank line, ignore all indents,
@ -530,16 +527,16 @@ void Python3Lexer::NEWLINEAction(antlr4::RuleContext *context, size_t actionInde
skip(); skip();
} }
else { else {
emit(commonToken(NEWLINE, newLine)); emit(make_CommonToken(NEWLINE, newLine));
int indent = getIndentationCount(spaces); int indent = getIndentationCount(spaces);
int previous = indents.empty() ? 0 : indents.top(); int previous = indents.empty() ? 0 : indents.top();
if (indent == previous) { if (indent == previous) {
// skip indents of the same size as the present indent-size // skip indents of the same size as the present indent-size
skip(); // do nothing
} }
else if (indent > previous) { else if (indent > previous) {
indents.push(indent); indents.push(indent);
emit(commonToken(Python3Lexer::INDENT, spaces)); emit(make_CommonToken(Python3Lexer::INDENT, spaces));
} }
else { else {
// Possibly emit more than 1 DEDENT token. // Possibly emit more than 1 DEDENT token.

View File

@ -39,25 +39,24 @@ public:
~Python3Lexer() override; ~Python3Lexer() override;
private:
// A queue where extra tokens are pushed on (see the NEWLINE lexer rule). // A queue where extra tokens are pushed on (see the NEWLINE lexer rule).
private: std::list<antlr4::Token*> tokens ; std::list<antlr4::Token *> tokens;
private:
// The stack that keeps track of the indentation level. // The stack that keeps track of the indentation level.
private: std::stack<int> indents ; std::stack<int> indents;
private:
// The number of open braces, brackets and parentheses. // The number of open braces, brackets and parentheses.
private: int opened = 0; int opened = 0;
// The most recently produced token. public:
private: antlr4::Token* lastToken = nullptr; void emit(std::unique_ptr<antlr4::Token> t) override {
tokens.push_back(t.get());
public: void emit(std::unique_ptr<antlr4::Token> t) override {
token.release(); token.release();
token = std::move(t); token = std::move(t);
tokens.push_back(token.get());
// std::cout<<t->toString()<<std::endl;
} }
public:
public: std::unique_ptr<antlr4::Token> nextToken() override { std::unique_ptr<antlr4::Token> nextToken() override {
// Check if the end-of-file is ahead and there are still some DEDENTS expected. // Check if the end-of-file is ahead and there are still some DEDENTS expected.
if (_input->LA(1) == EOF && !this->indents.empty()) { if (_input->LA(1) == EOF && !this->indents.empty()) {
// Remove any trailing EOF tokens from our buffer. // Remove any trailing EOF tokens from our buffer.
@ -69,10 +68,8 @@ public:
} }
} }
// First emit an extra line break that serves as the end of the statement. // First emit an extra line break that serves as the end of the statement.
std::unique_ptr<antlr4::Token> tmp=commonToken(Python3Lexer::NEWLINE, "\n"); emit(make_CommonToken(Python3Lexer::NEWLINE, "\n"));
this->emit(std::move(tmp));
// Now emit as many DEDENT tokens as needed. // Now emit as many DEDENT tokens as needed.
while (!indents.empty()) { while (!indents.empty()) {
@ -82,38 +79,30 @@ public:
} }
// Put the EOF back on the token stream. // Put the EOF back on the token stream.
this->emit(commonToken(static_cast<int>(Python3Lexer::EOF), "<EOF>")); this->emit(make_CommonToken(static_cast<int>(Python3Lexer::EOF), "<EOF>"));
}
std::unique_ptr<antlr4::Token> next = Lexer::nextToken();
if (next->getChannel() == antlr4::Token::DEFAULT_CHANNEL) {
// Keep track of the last token on the default channel.
this->lastToken = next.get();
} }
if (tokens.empty()) { if (tokens.empty()) {
return std::move(next); std::unique_ptr<antlr4::Token> next = Lexer::nextToken();
} else{
next.release(); next.release();
// release it because it should be controlled by 'tokens' now
}
auto tmp = tokens.front(); auto tmp = tokens.front();
tokens.pop_front(); tokens.pop_front();
return std::unique_ptr<antlr4::Token>(tmp); return std::unique_ptr<antlr4::Token>(tmp);
} }
} private:
std::unique_ptr<antlr4::Token> createDedent() {
private: std::unique_ptr<antlr4::Token> createDedent() { auto dedent = make_CommonToken(Python3Lexer::DEDENT, "");
auto dedent = commonToken(Python3Lexer::DEDENT, ""); dedent->setText("DEDENT");
dedent->setLine(this->lastToken->getLine());
return std::move(dedent); return std::move(dedent);
} }
private: std::unique_ptr<antlr4::CommonToken> commonToken(int type,std::string text) { private:
int stop = this->getCharIndex() - 1; std::unique_ptr<antlr4::CommonToken> make_CommonToken(int type, std::string const &text) {
int start = text.empty() ? stop : stop - text.length() + 1; size_t stop = this->getCharIndex() - 1;
return std::move(std::unique_ptr<antlr4::CommonToken>(new antlr4::CommonToken({ this, _input }, size_t start = text.empty() ? stop : stop - text.length() + 1;
type, return std::make_unique<antlr4::CommonToken>(std::make_pair(this, _input), type, DEFAULT_TOKEN_CHANNEL, start, stop);
DEFAULT_TOKEN_CHANNEL, start, stop)));
} }
// Calculates the indentation of the provided spaces, taking the // Calculates the indentation of the provided spaces, taking the
@ -124,22 +113,13 @@ public:
// the replacement is a multiple of eight [...]" // the replacement is a multiple of eight [...]"
// //
// -- https://docs.python.org/3.1/reference/lexical_analysis.html#indentation // -- https://docs.python.org/3.1/reference/lexical_analysis.html#indentation
static int getIndentationCount(std::string spaces) { static int getIndentationCount(std::string const &spaces) {
int count = 0; int count = 0;
for (char ch : spaces) { for (auto ch : spaces)
switch (ch) { if (ch == '\t') count += 8 - (count % 8);
case '\t': else ++count; // normal space char
count += 8 - (count % 8);
break;
default:
// A normal space char.
count++;
}
}
return count; return count;
} }
bool atStartOfInput() { bool atStartOfInput() {
return Lexer::getCharPositionInLine() == 0 && Lexer::getLine() == 1; return Lexer::getCharPositionInLine() == 0 && Lexer::getLine() == 1;
} }

View File

@ -10,25 +10,24 @@ tokens {
} }
@lexer::members { @lexer::members {
private:
// A queue where extra tokens are pushed on (see the NEWLINE lexer rule). // A queue where extra tokens are pushed on (see the NEWLINE lexer rule).
private: std::list<antlr4::Token*> tokens ; std::list<antlr4::Token *> tokens;
private:
// The stack that keeps track of the indentation level. // The stack that keeps track of the indentation level.
private: std::stack<int> indents ; std::stack<int> indents;
private:
// The number of open braces, brackets and parentheses. // The number of open braces, brackets and parentheses.
private: int opened = 0; int opened = 0;
// The most recently produced token. public:
private: antlr4::Token* lastToken = nullptr; void emit(std::unique_ptr<antlr4::Token> t) override {
tokens.push_back(t.get());
public: void emit(std::unique_ptr<antlr4::Token> t) override {
token.release(); token.release();
token = std::move(t); token = std::move(t);
tokens.push_back(token.get());
// std::cout<<t->toString()<<std::endl;
} }
public:
public: std::unique_ptr<antlr4::Token> nextToken() override { std::unique_ptr<antlr4::Token> nextToken() override {
// Check if the end-of-file is ahead and there are still some DEDENTS expected. // Check if the end-of-file is ahead and there are still some DEDENTS expected.
if (_input->LA(1) == EOF && !this->indents.empty()) { if (_input->LA(1) == EOF && !this->indents.empty()) {
// Remove any trailing EOF tokens from our buffer. // Remove any trailing EOF tokens from our buffer.
@ -40,10 +39,8 @@ tokens {
} }
} }
// First emit an extra line break that serves as the end of the statement. // First emit an extra line break that serves as the end of the statement.
std::unique_ptr<antlr4::Token> tmp=commonToken(Python3Lexer::NEWLINE, "\n"); emit(make_CommonToken(Python3Lexer::NEWLINE, "\n"));
this->emit(std::move(tmp));
// Now emit as many DEDENT tokens as needed. // Now emit as many DEDENT tokens as needed.
while (!indents.empty()) { while (!indents.empty()) {
@ -53,38 +50,30 @@ tokens {
} }
// Put the EOF back on the token stream. // Put the EOF back on the token stream.
this->emit(commonToken(static_cast<int>(Python3Lexer::EOF), "<EOF>")); this->emit(make_CommonToken(static_cast<int>(Python3Lexer::EOF), "<EOF>"));
}
std::unique_ptr<antlr4::Token> next = Lexer::nextToken();
if (next->getChannel() == antlr4::Token::DEFAULT_CHANNEL) {
// Keep track of the last token on the default channel.
this->lastToken = next.get();
} }
if (tokens.empty()) { if (tokens.empty()) {
return std::move(next); std::unique_ptr<antlr4::Token> next = Lexer::nextToken();
} else{
next.release(); next.release();
// release it because it should be controlled by 'tokens' now
}
auto tmp = tokens.front(); auto tmp = tokens.front();
tokens.pop_front(); tokens.pop_front();
return std::unique_ptr<antlr4::Token>(tmp); return std::unique_ptr<antlr4::Token>(tmp);
} }
} private:
std::unique_ptr<antlr4::Token> createDedent() {
private: std::unique_ptr<antlr4::Token> createDedent() { auto dedent = make_CommonToken(Python3Lexer::DEDENT, "");
auto dedent = commonToken(Python3Lexer::DEDENT, ""); dedent->setText("DEDENT");
dedent->setLine(this->lastToken->getLine());
return std::move(dedent); return std::move(dedent);
} }
private: std::unique_ptr<antlr4::CommonToken> commonToken(int type,std::string text) { private:
int stop = this->getCharIndex() - 1; std::unique_ptr<antlr4::CommonToken> make_CommonToken(int type, std::string const &text) {
int start = text.empty() ? stop : stop - text.length() + 1; size_t stop = this->getCharIndex() - 1;
return std::move(std::unique_ptr<antlr4::CommonToken>(new antlr4::CommonToken({ this, _input }, size_t start = text.empty() ? stop : stop - text.length() + 1;
type, return std::make_unique<antlr4::CommonToken>(std::make_pair(this, _input), type, DEFAULT_TOKEN_CHANNEL, start, stop);
DEFAULT_TOKEN_CHANNEL, start, stop)));
} }
// Calculates the indentation of the provided spaces, taking the // Calculates the indentation of the provided spaces, taking the
@ -95,22 +84,13 @@ tokens {
// the replacement is a multiple of eight [...]" // the replacement is a multiple of eight [...]"
// //
// -- https://docs.python.org/3.1/reference/lexical_analysis.html#indentation // -- https://docs.python.org/3.1/reference/lexical_analysis.html#indentation
static int getIndentationCount(std::string spaces) { static int getIndentationCount(std::string const &spaces) {
int count = 0; int count = 0;
for (char ch : spaces) { for (auto ch : spaces)
switch (ch) { if (ch == '\t') count += 8 - (count % 8);
case '\t': else ++count; // normal space char
count += 8 - (count % 8);
break;
default:
// A normal space char.
count++;
}
}
return count; return count;
} }
bool atStartOfInput() { bool atStartOfInput() {
return Lexer::getCharPositionInLine() == 0 && Lexer::getLine() == 1; return Lexer::getCharPositionInLine() == 0 && Lexer::getLine() == 1;
} }
@ -147,14 +127,11 @@ NEWLINE: (
{atStartOfInput()}? SPACES {atStartOfInput()}? SPACES
| ( '\r'? '\n' | '\r' | '\f') SPACES? | ( '\r'? '\n' | '\r' | '\f') SPACES?
) { ) {
{ { // Braces are required inside the switch
std::string pattern1="[^\r\n\f]+"; std::regex re1(R"([^\r\n\f]+)");
std::string pattern2="[\r\n\f]+"; std::regex re2(R"([\r\n\f]+)");
std::regex re1(pattern1); std::string newLine = regex_replace(getText(), re1, "");
std::regex re2(pattern2); std::string spaces = regex_replace(getText(), re2, "");
std::string fmt="";
std::string newLine=regex_replace(getText(),re1,fmt);
std::string spaces = regex_replace(getText(),re2,fmt);
int next = _input->LA(1); int next = _input->LA(1);
if (opened > 0 || next == '\r' || next == '\n' || next == '\f' || next == '#') { if (opened > 0 || next == '\r' || next == '\n' || next == '\f' || next == '#') {
// If we're inside a list or on a blank line, ignore all indents, // If we're inside a list or on a blank line, ignore all indents,
@ -162,16 +139,16 @@ NEWLINE: (
skip(); skip();
} }
else { else {
emit(commonToken(NEWLINE, newLine)); emit(make_CommonToken(NEWLINE, newLine));
int indent = getIndentationCount(spaces); int indent = getIndentationCount(spaces);
int previous = indents.empty() ? 0 : indents.top(); int previous = indents.empty() ? 0 : indents.top();
if (indent == previous) { if (indent == previous) {
// skip indents of the same size as the present indent-size // skip indents of the same size as the present indent-size
skip(); // do nothing
} }
else if (indent > previous) { else if (indent > previous) {
indents.push(indent); indents.push(indent);
emit(commonToken(Python3Lexer::INDENT, spaces)); emit(make_CommonToken(Python3Lexer::INDENT, spaces));
} }
else { else {
// Possibly emit more than 1 DEDENT token. // Possibly emit more than 1 DEDENT token.