fix(lexer): fix some issues in the lexer

Wankupi
2023-11-05 12:14:29 +08:00
parent 2e29af68b3
commit 4be04cec2d
3 changed files with 218 additions and 264 deletions


@@ -515,14 +515,11 @@ bool Python3Lexer::sempred(RuleContext *context, size_t ruleIndex, size_t predic
void Python3Lexer::NEWLINEAction(antlr4::RuleContext *context, size_t actionIndex) {
switch (actionIndex) {
case 0:
{
std::string pattern1="[^\r\n\f]+";
std::string pattern2="[\r\n\f]+";
std::regex re1(pattern1);
std::regex re2(pattern2);
std::string fmt="";
std::string newLine=regex_replace(getText(),re1,fmt);
std::string spaces = regex_replace(getText(),re2,fmt);
{ // Braces are required inside the switch
std::regex re1(R"([^\r\n\f]+)");
std::regex re2(R"([\r\n\f]+)");
std::string newLine = regex_replace(getText(), re1, "");
std::string spaces = regex_replace(getText(), re2, "");
int next = _input->LA(1);
if (opened > 0 || next == '\r' || next == '\n' || next == '\f' || next == '#') {
// If we're inside a list or on a blank line, ignore all indents,
@@ -530,16 +527,16 @@ void Python3Lexer::NEWLINEAction(antlr4::RuleContext *context, size_t actionInde
skip();
}
else {
emit(commonToken(NEWLINE, newLine));
emit(make_CommonToken(NEWLINE, newLine));
int indent = getIndentationCount(spaces);
int previous = indents.empty() ? 0 : indents.top();
if (indent == previous) {
// skip indents of the same size as the present indent-size
skip();
// do nothing
}
else if (indent > previous) {
indents.push(indent);
emit(commonToken(Python3Lexer::INDENT, spaces));
emit(make_CommonToken(Python3Lexer::INDENT, spaces));
}
else {
// Possibly emit more than 1 DEDENT token.


@@ -39,25 +39,24 @@ public:
~Python3Lexer() override;
private:
// A queue where extra tokens are pushed on (see the NEWLINE lexer rule).
private: std::list<antlr4::Token*> tokens ;
std::list<antlr4::Token *> tokens;
private:
// The stack that keeps track of the indentation level.
private: std::stack<int> indents ;
std::stack<int> indents;
private:
// The amount of opened braces, brackets and parenthesis.
private: int opened = 0;
// The most recently produced token.
private: antlr4::Token* lastToken = nullptr;
public: void emit(std::unique_ptr<antlr4::Token> t) override {
int opened = 0;
public:
void emit(std::unique_ptr<antlr4::Token> t) override {
tokens.push_back(t.get());
token.release();
token = std::move(t);
tokens.push_back(token.get());
// std::cout<<t->toString()<<std::endl;
}
public: std::unique_ptr<antlr4::Token> nextToken() override {
public:
std::unique_ptr<antlr4::Token> nextToken() override {
// Check if the end-of-file is ahead and there are still some DEDENTS expected.
if (_input->LA(1) == EOF && !this->indents.empty()) {
// Remove any trailing EOF tokens from our buffer.
@@ -69,10 +68,8 @@ public:
}
}
// First emit an extra line break that serves as the end of the statement.
std::unique_ptr<antlr4::Token> tmp=commonToken(Python3Lexer::NEWLINE, "\n");
this->emit(std::move(tmp));
emit(make_CommonToken(Python3Lexer::NEWLINE, "\n"));
// Now emit as much DEDENT tokens as needed.
while (!indents.empty()) {
@@ -82,38 +79,30 @@ public:
}
// Put the EOF back on the token stream.
this->emit(commonToken(static_cast<int>(Python3Lexer::EOF), "<EOF>"));
}
std::unique_ptr<antlr4::Token> next = Lexer::nextToken();
if (next->getChannel() == antlr4::Token::DEFAULT_CHANNEL) {
// Keep track of the last token on the default channel.
this->lastToken = next.get();
this->emit(make_CommonToken(static_cast<int>(Python3Lexer::EOF), "<EOF>"));
}
if (tokens.empty()) {
return std::move(next);
} else{
std::unique_ptr<antlr4::Token> next = Lexer::nextToken();
next.release();
// release it because it should be controlled by 'tokens' now
}
auto tmp = tokens.front();
tokens.pop_front();
return std::unique_ptr<antlr4::Token>(tmp);
}
}
private: std::unique_ptr<antlr4::Token> createDedent() {
auto dedent = commonToken(Python3Lexer::DEDENT, "");
dedent->setLine(this->lastToken->getLine());
private:
std::unique_ptr<antlr4::Token> createDedent() {
auto dedent = make_CommonToken(Python3Lexer::DEDENT, "");
dedent->setText("DEDENT");
return std::move(dedent);
}
private: std::unique_ptr<antlr4::CommonToken> commonToken(int type,std::string text) {
int stop = this->getCharIndex() - 1;
int start = text.empty() ? stop : stop - text.length() + 1;
return std::move(std::unique_ptr<antlr4::CommonToken>(new antlr4::CommonToken({ this, _input },
type,
DEFAULT_TOKEN_CHANNEL, start, stop)));
private:
std::unique_ptr<antlr4::CommonToken> make_CommonToken(int type, std::string const &text) {
size_t stop = this->getCharIndex() - 1;
size_t start = text.empty() ? stop : stop - text.length() + 1;
return std::make_unique<antlr4::CommonToken>(std::make_pair(this, _input), type, DEFAULT_TOKEN_CHANNEL, start, stop);
}
// Calculates the indentation of the provided spaces, taking the
@@ -124,22 +113,13 @@ public:
// the replacement is a multiple of eight [...]"
//
// -- https://docs.python.org/3.1/reference/lexical_analysis.html#indentation
static int getIndentationCount(std::string spaces) {
static int getIndentationCount(std::string const &spaces) {
int count = 0;
for (char ch : spaces) {
switch (ch) {
case '\t':
count += 8 - (count % 8);
break;
default:
// A normal space char.
count++;
}
}
for (auto ch : spaces)
if (ch == '\t') count += 8 - (count % 8);
else ++count; // normal space char
return count;
}
bool atStartOfInput() {
return Lexer::getCharPositionInLine() == 0 && Lexer::getLine() == 1;
}


@@ -10,25 +10,24 @@ tokens {
}
@lexer::members {
private:
// A queue where extra tokens are pushed on (see the NEWLINE lexer rule).
private: std::list<antlr4::Token*> tokens ;
std::list<antlr4::Token *> tokens;
private:
// The stack that keeps track of the indentation level.
private: std::stack<int> indents ;
std::stack<int> indents;
private:
// The amount of opened braces, brackets and parenthesis.
private: int opened = 0;
// The most recently produced token.
private: antlr4::Token* lastToken = nullptr;
public: void emit(std::unique_ptr<antlr4::Token> t) override {
int opened = 0;
public:
void emit(std::unique_ptr<antlr4::Token> t) override {
tokens.push_back(t.get());
token.release();
token = std::move(t);
tokens.push_back(token.get());
// std::cout<<t->toString()<<std::endl;
}
public: std::unique_ptr<antlr4::Token> nextToken() override {
public:
std::unique_ptr<antlr4::Token> nextToken() override {
// Check if the end-of-file is ahead and there are still some DEDENTS expected.
if (_input->LA(1) == EOF && !this->indents.empty()) {
// Remove any trailing EOF tokens from our buffer.
@@ -40,10 +39,8 @@ tokens {
}
}
// First emit an extra line break that serves as the end of the statement.
std::unique_ptr<antlr4::Token> tmp=commonToken(Python3Lexer::NEWLINE, "\n");
this->emit(std::move(tmp));
emit(make_CommonToken(Python3Lexer::NEWLINE, "\n"));
// Now emit as much DEDENT tokens as needed.
while (!indents.empty()) {
@@ -53,38 +50,30 @@ tokens {
}
// Put the EOF back on the token stream.
this->emit(commonToken(static_cast<int>(Python3Lexer::EOF), "<EOF>"));
}
std::unique_ptr<antlr4::Token> next = Lexer::nextToken();
if (next->getChannel() == antlr4::Token::DEFAULT_CHANNEL) {
// Keep track of the last token on the default channel.
this->lastToken = next.get();
this->emit(make_CommonToken(static_cast<int>(Python3Lexer::EOF), "<EOF>"));
}
if (tokens.empty()) {
return std::move(next);
} else{
std::unique_ptr<antlr4::Token> next = Lexer::nextToken();
next.release();
// release it because it should be controlled by 'tokens' now
}
auto tmp = tokens.front();
tokens.pop_front();
return std::unique_ptr<antlr4::Token>(tmp);
}
}
private: std::unique_ptr<antlr4::Token> createDedent() {
auto dedent = commonToken(Python3Lexer::DEDENT, "");
dedent->setLine(this->lastToken->getLine());
private:
std::unique_ptr<antlr4::Token> createDedent() {
auto dedent = make_CommonToken(Python3Lexer::DEDENT, "");
dedent->setText("DEDENT");
return std::move(dedent);
}
private: std::unique_ptr<antlr4::CommonToken> commonToken(int type,std::string text) {
int stop = this->getCharIndex() - 1;
int start = text.empty() ? stop : stop - text.length() + 1;
return std::move(std::unique_ptr<antlr4::CommonToken>(new antlr4::CommonToken({ this, _input },
type,
DEFAULT_TOKEN_CHANNEL, start, stop)));
private:
std::unique_ptr<antlr4::CommonToken> make_CommonToken(int type, std::string const &text) {
size_t stop = this->getCharIndex() - 1;
size_t start = text.empty() ? stop : stop - text.length() + 1;
return std::make_unique<antlr4::CommonToken>(std::make_pair(this, _input), type, DEFAULT_TOKEN_CHANNEL, start, stop);
}
// Calculates the indentation of the provided spaces, taking the
@@ -95,22 +84,13 @@ tokens {
// the replacement is a multiple of eight [...]"
//
// -- https://docs.python.org/3.1/reference/lexical_analysis.html#indentation
static int getIndentationCount(std::string spaces) {
static int getIndentationCount(std::string const &spaces) {
int count = 0;
for (char ch : spaces) {
switch (ch) {
case '\t':
count += 8 - (count % 8);
break;
default:
// A normal space char.
count++;
}
}
for (auto ch : spaces)
if (ch == '\t') count += 8 - (count % 8);
else ++count; // normal space char
return count;
}
bool atStartOfInput() {
return Lexer::getCharPositionInLine() == 0 && Lexer::getLine() == 1;
}
@@ -147,14 +127,11 @@ NEWLINE: (
{atStartOfInput()}? SPACES
| ( '\r'? '\n' | '\r' | '\f') SPACES?
) {
{
std::string pattern1="[^\r\n\f]+";
std::string pattern2="[\r\n\f]+";
std::regex re1(pattern1);
std::regex re2(pattern2);
std::string fmt="";
std::string newLine=regex_replace(getText(),re1,fmt);
std::string spaces = regex_replace(getText(),re2,fmt);
{ // Braces are required inside the switch
std::regex re1(R"([^\r\n\f]+)");
std::regex re2(R"([\r\n\f]+)");
std::string newLine = regex_replace(getText(), re1, "");
std::string spaces = regex_replace(getText(), re2, "");
int next = _input->LA(1);
if (opened > 0 || next == '\r' || next == '\n' || next == '\f' || next == '#') {
// If we're inside a list or on a blank line, ignore all indents,
@@ -162,16 +139,16 @@ NEWLINE: (
skip();
}
else {
emit(commonToken(NEWLINE, newLine));
emit(make_CommonToken(NEWLINE, newLine));
int indent = getIndentationCount(spaces);
int previous = indents.empty() ? 0 : indents.top();
if (indent == previous) {
// skip indents of the same size as the present indent-size
skip();
// do nothing
}
else if (indent > previous) {
indents.push(indent);
emit(commonToken(Python3Lexer::INDENT, spaces));
emit(make_CommonToken(Python3Lexer::INDENT, spaces));
}
else {
// Possibly emit more than 1 DEDENT token.