fix(lexer): fix several issues in the lexer

Wankupi
2023-11-05 12:14:29 +08:00
parent 2e29af68b3
commit 4be04cec2d
3 changed files with 218 additions and 264 deletions


@@ -515,42 +515,39 @@ bool Python3Lexer::sempred(RuleContext *context, size_t ruleIndex, size_t predic
 void Python3Lexer::NEWLINEAction(antlr4::RuleContext *context, size_t actionIndex) {
   switch (actionIndex) {
     case 0:
-      {
-        std::string pattern1="[^\r\n\f]+";
-        std::string pattern2="[\r\n\f]+";
-        std::regex re1(pattern1);
-        std::regex re2(pattern2);
-        std::string fmt="";
-        std::string newLine=regex_replace(getText(),re1,fmt);
-        std::string spaces = regex_replace(getText(),re2,fmt);
+      { // Braces are required inside the switch
+        std::regex re1(R"([^\r\n\f]+)");
+        std::regex re2(R"([\r\n\f]+)");
+        std::string newLine = regex_replace(getText(), re1, "");
+        std::string spaces = regex_replace(getText(), re2, "");
         int next = _input->LA(1);
         if (opened > 0 || next == '\r' || next == '\n' || next == '\f' || next == '#') {
           // If we're inside a list or on a blank line, ignore all indents,
           // dedents and line breaks.
           skip();
         }
         else {
-          emit(commonToken(NEWLINE, newLine));
+          emit(make_CommonToken(NEWLINE, newLine));
           int indent = getIndentationCount(spaces);
           int previous = indents.empty() ? 0 : indents.top();
           if (indent == previous) {
             // skip indents of the same size as the present indent-size
-            skip();
+            // do nothing
           }
           else if (indent > previous) {
             indents.push(indent);
-            emit(commonToken(Python3Lexer::INDENT, spaces));
+            emit(make_CommonToken(Python3Lexer::INDENT, spaces));
           }
           else {
             // Possibly emit more than 1 DEDENT token.
-            while(!indents.empty() && indents.top() > indent) {
+            while (!indents.empty() && indents.top() > indent) {
               this->emit(createDedent());
               indents.pop();
             }
           }
         }
       }
       break;
     default:
       break;
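For readers skimming the diff: the NEWLINE action above boils down to comparing each line's indentation width against a stack of previously seen widths. Below is a minimal standalone sketch of that bookkeeping, with no ANTLR dependency; the names emitToken and handleNewline are hypothetical, and plain strings stand in for real tokens.

// Standalone sketch of the indent-stack logic used by the NEWLINE action.
#include <iostream>
#include <stack>
#include <string>
#include <vector>

static std::stack<int> indents; // mirrors the lexer's `indents` member

static void emitToken(const std::string &t) { std::cout << t << ' '; }

// Called once per physical line with that line's indentation width.
static void handleNewline(int indent) {
    emitToken("NEWLINE");
    int previous = indents.empty() ? 0 : indents.top();
    if (indent == previous) {
        // same indentation: nothing besides the NEWLINE
    } else if (indent > previous) {
        indents.push(indent);
        emitToken("INDENT");
    } else {
        // possibly more than one DEDENT
        while (!indents.empty() && indents.top() > indent) {
            emitToken("DEDENT");
            indents.pop();
        }
    }
}

int main() {
    // Indentation widths for e.g. "if x:" / "    y = 1" / "    z = 2" / "print(y)"
    std::vector<int> lineIndents = {0, 4, 4, 0};
    for (int w : lineIndents) handleNewline(w);
    std::cout << '\n'; // prints: NEWLINE NEWLINE INDENT NEWLINE NEWLINE DEDENT
}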


@@ -39,110 +39,90 @@ public:
   ~Python3Lexer() override;
-  // A queue where extra tokens are pushed on (see the NEWLINE lexer rule).
-  private: std::list<antlr4::Token*> tokens ;
-  // The stack that keeps track of the indentation level.
-  private: std::stack<int> indents ;
-  // The amount of opened braces, brackets and parenthesis.
-  private: int opened = 0;
-  // The most recently produced token.
-  private: antlr4::Token* lastToken = nullptr;
-
-  public: void emit(std::unique_ptr<antlr4::Token> t) override {
-    token.release();
-    token=std::move(t);
-    tokens.push_back(token.get());
-    // std::cout<<t->toString()<<std::endl;
-  }
-
-  public: std::unique_ptr<antlr4::Token> nextToken() override {
-    // Check if the end-of-file is ahead and there are still some DEDENTS expected.
-    if (_input->LA(1) == EOF && !this->indents.empty()) {
-      // Remove any trailing EOF tokens from our buffer.
-      for(auto i=tokens.rbegin();i!=tokens.rend();){
-        auto tmp=i;
-        i++;
-        if((*tmp)->getType()==EOF){
-          tokens.erase(tmp.base());
-        }
-      }
-
-      // First emit an extra line break that serves as the end of the statement.
-      std::unique_ptr<antlr4::Token> tmp=commonToken(Python3Lexer::NEWLINE, "\n");
-      this->emit(std::move(tmp));
-
-      // Now emit as much DEDENT tokens as needed.
-      while (!indents.empty()) {
-        auto tmp=createDedent();
-        this->emit(std::move(tmp));
-        indents.pop();
-      }
-
-      // Put the EOF back on the token stream.
-      this->emit(commonToken(static_cast<int>(Python3Lexer::EOF), "<EOF>"));
-    }
-
-    std::unique_ptr<antlr4::Token> next = Lexer::nextToken();
-
-    if (next->getChannel() == antlr4::Token::DEFAULT_CHANNEL) {
-      // Keep track of the last token on the default channel.
-      this->lastToken = next.get();
-    }
-    if (tokens.empty()) {
-      return std::move(next);
-    } else{
-      next.release();
-      auto tmp=tokens.front();
-      tokens.pop_front();
-      return std::unique_ptr<antlr4::Token>(tmp);
-    }
-  }
-  private: std::unique_ptr<antlr4::Token> createDedent() {
-    auto dedent = commonToken(Python3Lexer::DEDENT, "");
-    dedent->setLine(this->lastToken->getLine());
-    return std::move(dedent);
-  }
-  private: std::unique_ptr<antlr4::CommonToken> commonToken(int type,std::string text) {
-    int stop = this->getCharIndex() - 1;
-    int start = text.empty() ? stop : stop - text.length() + 1;
-    return std::move(std::unique_ptr<antlr4::CommonToken>(new antlr4::CommonToken({ this, _input },
-      type,
-      DEFAULT_TOKEN_CHANNEL, start, stop)));
-  }
-  // Calculates the indentation of the provided spaces, taking the
-  // following rules into account:
-  //
-  // "Tabs are replaced (from left to right) by one to eight spaces
-  // such that the total number of characters up to and including
-  // the replacement is a multiple of eight [...]"
-  //
-  // -- https://docs.python.org/3.1/reference/lexical_analysis.html#indentation
-  static int getIndentationCount(std::string spaces) {
-    int count = 0;
-    for (char ch : spaces) {
-      switch (ch) {
-        case '\t':
-          count += 8 - (count % 8);
-          break;
-        default:
-          // A normal space char.
-          count++;
-      }
-    }
-    return count;
-  }
-  bool atStartOfInput() {
-    return Lexer::getCharPositionInLine() == 0 && Lexer::getLine() == 1;
-  }
+private:
+  // A queue where extra tokens are pushed on (see the NEWLINE lexer rule).
+  std::list<antlr4::Token *> tokens;
+private:
+  // The stack that keeps track of the indentation level.
+  std::stack<int> indents;
+private:
+  // The amount of opened braces, brackets and parenthesis.
+  int opened = 0;
+
+public:
+  void emit(std::unique_ptr<antlr4::Token> t) override {
+    tokens.push_back(t.get());
+    token.release();
+    token = std::move(t);
+  }
+
+public:
+  std::unique_ptr<antlr4::Token> nextToken() override {
+    // Check if the end-of-file is ahead and there are still some DEDENTS expected.
+    if (_input->LA(1) == EOF && !this->indents.empty()) {
+      // Remove any trailing EOF tokens from our buffer.
+      for (auto i = tokens.rbegin(); i != tokens.rend();) {
+        auto tmp = i;
+        i++;
+        if ((*tmp)->getType() == EOF) {
+          tokens.erase(tmp.base());
+        }
+      }
+
+      // First emit an extra line break that serves as the end of the statement.
+      emit(make_CommonToken(Python3Lexer::NEWLINE, "\n"));
+
+      // Now emit as much DEDENT tokens as needed.
+      while (!indents.empty()) {
+        auto tmp = createDedent();
+        this->emit(std::move(tmp));
+        indents.pop();
+      }
+
+      // Put the EOF back on the token stream.
+      this->emit(make_CommonToken(static_cast<int>(Python3Lexer::EOF), "<EOF>"));
+    }
+    if (tokens.empty()) {
+      std::unique_ptr<antlr4::Token> next = Lexer::nextToken();
+      next.release();
+      // release it because it should be controlled by 'tokens' now
+    }
+    auto tmp = tokens.front();
+    tokens.pop_front();
+    return std::unique_ptr<antlr4::Token>(tmp);
+  }
+private:
+  std::unique_ptr<antlr4::Token> createDedent() {
+    auto dedent = make_CommonToken(Python3Lexer::DEDENT, "");
+    dedent->setText("DEDENT");
+    return std::move(dedent);
+  }
+private:
+  std::unique_ptr<antlr4::CommonToken> make_CommonToken(int type, std::string const &text) {
+    size_t stop = this->getCharIndex() - 1;
+    size_t start = text.empty() ? stop : stop - text.length() + 1;
+    return std::make_unique<antlr4::CommonToken>(std::make_pair(this, _input), type, DEFAULT_TOKEN_CHANNEL, start, stop);
+  }
+  // Calculates the indentation of the provided spaces, taking the
+  // following rules into account:
+  //
+  // "Tabs are replaced (from left to right) by one to eight spaces
+  // such that the total number of characters up to and including
+  // the replacement is a multiple of eight [...]"
+  //
+  // -- https://docs.python.org/3.1/reference/lexical_analysis.html#indentation
+  static int getIndentationCount(std::string const &spaces) {
+    int count = 0;
+    for (auto ch : spaces)
+      if (ch == '\t') count += 8 - (count % 8);
+      else ++count; // normal space char
+    return count;
+  }
+  bool atStartOfInput() {
+    return Lexer::getCharPositionInLine() == 0 && Lexer::getLine() == 1;
+  }
   std::string getGrammarFileName() const override;
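The overridden emit()/nextToken() pair in this header implements a simple FIFO hand-off: every emitted token is appended to the `tokens` queue, and nextToken() always returns the front of that queue, pulling a fresh token from the underlying lexer only when the queue is empty. Below is a minimal standalone sketch of that pattern; SketchLexer and scanOne are hypothetical names, and strings stand in for the antlr4::Token pointers the real code stores.

// Queue-based token delivery, sketched without ANTLR.
#include <deque>
#include <iostream>
#include <string>

struct SketchLexer {
    std::deque<std::string> queue;      // plays the role of `tokens`
    int produced = 0;

    void emit(std::string t) { queue.push_back(std::move(t)); }

    // Stand-in for Lexer::nextToken(): produces one token per call.
    void scanOne() { emit("TOK" + std::to_string(++produced)); }

    std::string nextToken() {
        if (queue.empty()) scanOne();   // refill only when the buffer is empty
        std::string front = queue.front();
        queue.pop_front();
        return front;
    }
};

int main() {
    SketchLexer lx;
    lx.emit("NEWLINE");                 // extra tokens injected by a lexer action
    lx.emit("DEDENT");
    std::cout << lx.nextToken() << ' '  // NEWLINE
              << lx.nextToken() << ' '  // DEDENT
              << lx.nextToken() << '\n';// TOK1 (pulled from the scanner)
}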


@@ -10,110 +10,90 @@ tokens {
 }
 @lexer::members {
-// A queue where extra tokens are pushed on (see the NEWLINE lexer rule).
-private: std::list<antlr4::Token*> tokens ;
-// The stack that keeps track of the indentation level.
-private: std::stack<int> indents ;
-// The amount of opened braces, brackets and parenthesis.
-private: int opened = 0;
-// The most recently produced token.
-private: antlr4::Token* lastToken = nullptr;
-
-public: void emit(std::unique_ptr<antlr4::Token> t) override {
-  token.release();
-  token=std::move(t);
-  tokens.push_back(token.get());
-  // std::cout<<t->toString()<<std::endl;
-}
-
-public: std::unique_ptr<antlr4::Token> nextToken() override {
-  // Check if the end-of-file is ahead and there are still some DEDENTS expected.
-  if (_input->LA(1) == EOF && !this->indents.empty()) {
-    // Remove any trailing EOF tokens from our buffer.
-    for(auto i=tokens.rbegin();i!=tokens.rend();){
-      auto tmp=i;
-      i++;
-      if((*tmp)->getType()==EOF){
-        tokens.erase(tmp.base());
-      }
-    }
-
-    // First emit an extra line break that serves as the end of the statement.
-    std::unique_ptr<antlr4::Token> tmp=commonToken(Python3Lexer::NEWLINE, "\n");
-    this->emit(std::move(tmp));
-
-    // Now emit as much DEDENT tokens as needed.
-    while (!indents.empty()) {
-      auto tmp=createDedent();
-      this->emit(std::move(tmp));
-      indents.pop();
-    }
-
-    // Put the EOF back on the token stream.
-    this->emit(commonToken(static_cast<int>(Python3Lexer::EOF), "<EOF>"));
-  }
-
-  std::unique_ptr<antlr4::Token> next = Lexer::nextToken();
-
-  if (next->getChannel() == antlr4::Token::DEFAULT_CHANNEL) {
-    // Keep track of the last token on the default channel.
-    this->lastToken = next.get();
-  }
-  if (tokens.empty()) {
-    return std::move(next);
-  } else{
-    next.release();
-    auto tmp=tokens.front();
-    tokens.pop_front();
-    return std::unique_ptr<antlr4::Token>(tmp);
-  }
-}
-private: std::unique_ptr<antlr4::Token> createDedent() {
-  auto dedent = commonToken(Python3Lexer::DEDENT, "");
-  dedent->setLine(this->lastToken->getLine());
-  return std::move(dedent);
-}
-private: std::unique_ptr<antlr4::CommonToken> commonToken(int type,std::string text) {
-  int stop = this->getCharIndex() - 1;
-  int start = text.empty() ? stop : stop - text.length() + 1;
-  return std::move(std::unique_ptr<antlr4::CommonToken>(new antlr4::CommonToken({ this, _input },
-    type,
-    DEFAULT_TOKEN_CHANNEL, start, stop)));
-}
-// Calculates the indentation of the provided spaces, taking the
-// following rules into account:
-//
-// "Tabs are replaced (from left to right) by one to eight spaces
-// such that the total number of characters up to and including
-// the replacement is a multiple of eight [...]"
-//
-// -- https://docs.python.org/3.1/reference/lexical_analysis.html#indentation
-static int getIndentationCount(std::string spaces) {
-  int count = 0;
-  for (char ch : spaces) {
-    switch (ch) {
-      case '\t':
-        count += 8 - (count % 8);
-        break;
-      default:
-        // A normal space char.
-        count++;
-    }
-  }
-  return count;
-}
-bool atStartOfInput() {
-  return Lexer::getCharPositionInLine() == 0 && Lexer::getLine() == 1;
-}
+private:
+// A queue where extra tokens are pushed on (see the NEWLINE lexer rule).
+std::list<antlr4::Token *> tokens;
+private:
+// The stack that keeps track of the indentation level.
+std::stack<int> indents;
+private:
+// The amount of opened braces, brackets and parenthesis.
+int opened = 0;
+
+public:
+void emit(std::unique_ptr<antlr4::Token> t) override {
+  tokens.push_back(t.get());
+  token.release();
+  token = std::move(t);
+}
+
+public:
+std::unique_ptr<antlr4::Token> nextToken() override {
+  // Check if the end-of-file is ahead and there are still some DEDENTS expected.
+  if (_input->LA(1) == EOF && !this->indents.empty()) {
+    // Remove any trailing EOF tokens from our buffer.
+    for (auto i = tokens.rbegin(); i != tokens.rend();) {
+      auto tmp = i;
+      i++;
+      if ((*tmp)->getType() == EOF) {
+        tokens.erase(tmp.base());
+      }
+    }
+
+    // First emit an extra line break that serves as the end of the statement.
+    emit(make_CommonToken(Python3Lexer::NEWLINE, "\n"));
+
+    // Now emit as much DEDENT tokens as needed.
+    while (!indents.empty()) {
+      auto tmp = createDedent();
+      this->emit(std::move(tmp));
+      indents.pop();
+    }
+
+    // Put the EOF back on the token stream.
+    this->emit(make_CommonToken(static_cast<int>(Python3Lexer::EOF), "<EOF>"));
+  }
+  if (tokens.empty()) {
+    std::unique_ptr<antlr4::Token> next = Lexer::nextToken();
+    next.release();
+    // release it because it should be controlled by 'tokens' now
+  }
+  auto tmp = tokens.front();
+  tokens.pop_front();
+  return std::unique_ptr<antlr4::Token>(tmp);
+}
+private:
+std::unique_ptr<antlr4::Token> createDedent() {
+  auto dedent = make_CommonToken(Python3Lexer::DEDENT, "");
+  dedent->setText("DEDENT");
+  return std::move(dedent);
+}
+private:
+std::unique_ptr<antlr4::CommonToken> make_CommonToken(int type, std::string const &text) {
+  size_t stop = this->getCharIndex() - 1;
+  size_t start = text.empty() ? stop : stop - text.length() + 1;
+  return std::make_unique<antlr4::CommonToken>(std::make_pair(this, _input), type, DEFAULT_TOKEN_CHANNEL, start, stop);
+}
+// Calculates the indentation of the provided spaces, taking the
+// following rules into account:
+//
+// "Tabs are replaced (from left to right) by one to eight spaces
+// such that the total number of characters up to and including
+// the replacement is a multiple of eight [...]"
+//
+// -- https://docs.python.org/3.1/reference/lexical_analysis.html#indentation
+static int getIndentationCount(std::string const &spaces) {
+  int count = 0;
+  for (auto ch : spaces)
+    if (ch == '\t') count += 8 - (count % 8);
+    else ++count; // normal space char
+  return count;
+}
+bool atStartOfInput() {
+  return Lexer::getCharPositionInLine() == 0 && Lexer::getLine() == 1;
+}
 }
 STRING: STRING_LITERAL | BYTES_LITERAL;
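getIndentationCount implements the tab rule quoted in the comment above: a tab advances the count to the next multiple of eight. The following quick self-check copies the new helper verbatim and asserts a few hand-worked values.

// Sanity check of the tab-expansion rule.
#include <cassert>
#include <string>

static int getIndentationCount(std::string const &spaces) {
    int count = 0;
    for (auto ch : spaces)
        if (ch == '\t') count += 8 - (count % 8);
        else ++count; // normal space char
    return count;
}

int main() {
    assert(getIndentationCount("    ") == 4);  // four spaces
    assert(getIndentationCount("\t") == 8);    // lone tab lands on column 8
    assert(getIndentationCount("   \t") == 8); // 3 spaces + tab still lands on 8
    assert(getIndentationCount("\t ") == 9);   // tab then one space
    return 0;
}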
@@ -147,42 +127,39 @@ NEWLINE: (
     {atStartOfInput()}? SPACES
     | ( '\r'? '\n' | '\r' | '\f') SPACES?
   ) {
-    {
-      std::string pattern1="[^\r\n\f]+";
-      std::string pattern2="[\r\n\f]+";
-      std::regex re1(pattern1);
-      std::regex re2(pattern2);
-      std::string fmt="";
-      std::string newLine=regex_replace(getText(),re1,fmt);
-      std::string spaces = regex_replace(getText(),re2,fmt);
+    { // Braces are required inside the switch
+      std::regex re1(R"([^\r\n\f]+)");
+      std::regex re2(R"([\r\n\f]+)");
+      std::string newLine = regex_replace(getText(), re1, "");
+      std::string spaces = regex_replace(getText(), re2, "");
       int next = _input->LA(1);
       if (opened > 0 || next == '\r' || next == '\n' || next == '\f' || next == '#') {
         // If we're inside a list or on a blank line, ignore all indents,
        // dedents and line breaks.
         skip();
       }
       else {
-        emit(commonToken(NEWLINE, newLine));
+        emit(make_CommonToken(NEWLINE, newLine));
         int indent = getIndentationCount(spaces);
         int previous = indents.empty() ? 0 : indents.top();
         if (indent == previous) {
           // skip indents of the same size as the present indent-size
-          skip();
+          // do nothing
         }
         else if (indent > previous) {
           indents.push(indent);
-          emit(commonToken(Python3Lexer::INDENT, spaces));
+          emit(make_CommonToken(Python3Lexer::INDENT, spaces));
         }
         else {
           // Possibly emit more than 1 DEDENT token.
-          while(!indents.empty() && indents.top() > indent) {
+          while (!indents.empty() && indents.top() > indent) {
             this->emit(createDedent());
             indents.pop();
           }
         }
       }
     }
   };
 /// identifier ::= id_start id_continue*
 NAME: ID_START ID_CONTINUE*;
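make_CommonToken derives the new token's character range from the current lexer position: stop is getCharIndex() - 1 and start is stop - text.length() + 1, collapsing to stop for empty text. A small hand-worked check of that arithmetic follows; it assumes getCharIndex() is one past the last consumed character, and the helper name tokenBounds is hypothetical.

// Hand-worked check of the start/stop arithmetic used by make_CommonToken.
#include <cassert>
#include <cstddef>
#include <string>
#include <utility>

// charIndex is a stand-in for this->getCharIndex().
static std::pair<std::size_t, std::size_t> tokenBounds(std::size_t charIndex, const std::string &text) {
    std::size_t stop = charIndex - 1;
    std::size_t start = text.empty() ? stop : stop - text.length() + 1;
    return {start, stop};
}

int main() {
    // After consuming "abc\n" the char index is 4, so a "\n" token spans [3, 3].
    auto nl = tokenBounds(4, "\n");
    assert(nl.first == 3 && nl.second == 3);
    // A zero-length token (e.g. DEDENT) at the same point also collapses to [3, 3].
    auto dedent = tokenBounds(4, "");
    assert(dedent.first == 3 && dedent.second == 3);
    return 0;
}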