From 4be04cec2d4d5a98997b757f41d7f948f46d17fa Mon Sep 17 00:00:00 2001 From: Wankupi <2893353848@qq.com> Date: Sun, 5 Nov 2023 12:14:29 +0800 Subject: [PATCH] =?UTF-8?q?fix(lexer):=20=E4=BF=AE=E5=A4=8D=E4=BA=86?= =?UTF-8?q?=E8=AF=8D=E6=B3=95=E5=88=86=E6=9E=90=E5=99=A8=E4=B8=AD=E7=9A=84?= =?UTF-8?q?=E4=B8=80=E4=BA=9B=E9=97=AE=E9=A2=98?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- generated/Python3Lexer.cpp | 67 +++++------ generated/Python3Lexer.h | 174 ++++++++++++-------------- resources/Python3Lexer.g4 | 241 +++++++++++++++++-------------------- 3 files changed, 218 insertions(+), 264 deletions(-) diff --git a/generated/Python3Lexer.cpp b/generated/Python3Lexer.cpp index 6f11176..b3ee877 100644 --- a/generated/Python3Lexer.cpp +++ b/generated/Python3Lexer.cpp @@ -515,42 +515,39 @@ bool Python3Lexer::sempred(RuleContext *context, size_t ruleIndex, size_t predic void Python3Lexer::NEWLINEAction(antlr4::RuleContext *context, size_t actionIndex) { switch (actionIndex) { case 0: - { - std::string pattern1="[^\r\n\f]+"; - std::string pattern2="[\r\n\f]+"; - std::regex re1(pattern1); - std::regex re2(pattern2); - std::string fmt=""; - std::string newLine=regex_replace(getText(),re1,fmt); - std::string spaces = regex_replace(getText(),re2,fmt); - int next = _input->LA(1); - if (opened > 0 || next == '\r' || next == '\n' || next == '\f' || next == '#') { - // If we're inside a list or on a blank line, ignore all indents, - // dedents and line breaks. - skip(); - } - else { - emit(commonToken(NEWLINE, newLine)); - int indent = getIndentationCount(spaces); - int previous = indents.empty() ? 0 : indents.top(); - if (indent == previous) { - // skip indents of the same size as the present indent-size - skip(); - } - else if (indent > previous) { - indents.push(indent); - emit(commonToken(Python3Lexer::INDENT, spaces)); - } - else { - // Possibly emit more than 1 DEDENT token. - while(!indents.empty() && indents.top() > indent) { - this->emit(createDedent()); - indents.pop(); - } - } - } + { // Braces are required inside the switch + std::regex re1(R"([^\r\n\f]+)"); + std::regex re2(R"([\r\n\f]+)"); + std::string newLine = regex_replace(getText(), re1, ""); + std::string spaces = regex_replace(getText(), re2, ""); + int next = _input->LA(1); + if (opened > 0 || next == '\r' || next == '\n' || next == '\f' || next == '#') { + // If we're inside a list or on a blank line, ignore all indents, + // dedents and line breaks. + skip(); } - break; + else { + emit(make_CommonToken(NEWLINE, newLine)); + int indent = getIndentationCount(spaces); + int previous = indents.empty() ? 0 : indents.top(); + if (indent == previous) { + // skip indents of the same size as the present indent-size + // do nothing + } + else if (indent > previous) { + indents.push(indent); + emit(make_CommonToken(Python3Lexer::INDENT, spaces)); + } + else { + // Possibly emit more than 1 DEDENT token. + while (!indents.empty() && indents.top() > indent) { + this->emit(createDedent()); + indents.pop(); + } + } + } + } + break; default: break; diff --git a/generated/Python3Lexer.h b/generated/Python3Lexer.h index de9d2cc..77dd4a5 100644 --- a/generated/Python3Lexer.h +++ b/generated/Python3Lexer.h @@ -39,110 +39,90 @@ public: ~Python3Lexer() override; - // A queue where extra tokens are pushed on (see the NEWLINE lexer rule). - private: std::list tokens ; - // The stack that keeps track of the indentation level. - private: std::stack indents ; - // The amount of opened braces, brackets and parenthesis. - private: int opened = 0; - // The most recently produced token. - private: antlr4::Token* lastToken = nullptr; + private: + // A queue where extra tokens are pushed on (see the NEWLINE lexer rule). + std::list tokens; + private: + // The stack that keeps track of the indentation level. + std::stack indents; + private: + // The amount of opened braces, brackets and parenthesis. + int opened = 0; + public: + void emit(std::unique_ptr t) override { + tokens.push_back(t.get()); + token.release(); + token = std::move(t); + } - public: void emit(std::unique_ptr t) override { - token.release(); - token=std::move(t); + public: + std::unique_ptr nextToken() override { + // Check if the end-of-file is ahead and there are still some DEDENTS expected. + if (_input->LA(1) == EOF && !this->indents.empty()) { + // Remove any trailing EOF tokens from our buffer. + for (auto i = tokens.rbegin(); i != tokens.rend();) { + auto tmp = i; + i++; + if ((*tmp)->getType() == EOF) { + tokens.erase(tmp.base()); + } + } - tokens.push_back(token.get()); - // std::cout<toString()<emit(std::move(tmp)); + indents.pop(); + } - public: std::unique_ptr nextToken() override { - // Check if the end-of-file is ahead and there are still some DEDENTS expected. - if (_input->LA(1) == EOF && !this->indents.empty()) { - // Remove any trailing EOF tokens from our buffer. - for(auto i=tokens.rbegin();i!=tokens.rend();){ - auto tmp=i; - i++; - if((*tmp)->getType()==EOF){ - tokens.erase(tmp.base()); - } - } + // Put the EOF back on the token stream. + this->emit(make_CommonToken(static_cast(Python3Lexer::EOF), "")); + } + if (tokens.empty()) { + std::unique_ptr next = Lexer::nextToken(); + next.release(); + // release it because it should be controlled by 'tokens' now + } + auto tmp = tokens.front(); + tokens.pop_front(); + return std::unique_ptr(tmp); + } + private: + std::unique_ptr createDedent() { + auto dedent = make_CommonToken(Python3Lexer::DEDENT, ""); + dedent->setText("DEDENT"); + return std::move(dedent); + } - // First emit an extra line break that serves as the end of the statement. - std::unique_ptr tmp=commonToken(Python3Lexer::NEWLINE, "\n"); - this->emit(std::move(tmp)); + private: + std::unique_ptr make_CommonToken(int type, std::string const &text) { + size_t stop = this->getCharIndex() - 1; + size_t start = text.empty() ? stop : stop - text.length() + 1; + return std::make_unique(std::make_pair(this, _input), type, DEFAULT_TOKEN_CHANNEL, start, stop); + } - // Now emit as much DEDENT tokens as needed. - while (!indents.empty()) { - auto tmp=createDedent(); - this->emit(std::move(tmp)); - indents.pop(); - } - - // Put the EOF back on the token stream. - this->emit(commonToken(static_cast(Python3Lexer::EOF), "")); - } - - std::unique_ptr next = Lexer::nextToken(); - - if (next->getChannel() == antlr4::Token::DEFAULT_CHANNEL) { - // Keep track of the last token on the default channel. - this->lastToken = next.get(); - } - if (tokens.empty()) { - return std::move(next); - } else{ - next.release(); - auto tmp=tokens.front(); - tokens.pop_front(); - return std::unique_ptr(tmp); - } - - } - - private: std::unique_ptr createDedent() { - auto dedent = commonToken(Python3Lexer::DEDENT, ""); - dedent->setLine(this->lastToken->getLine()); - return std::move(dedent); - } - - private: std::unique_ptr commonToken(int type,std::string text) { - int stop = this->getCharIndex() - 1; - int start = text.empty() ? stop : stop - text.length() + 1; - return std::move(std::unique_ptr(new antlr4::CommonToken({ this, _input }, - type, - DEFAULT_TOKEN_CHANNEL, start, stop))); - } - - // Calculates the indentation of the provided spaces, taking the - // following rules into account: - // - // "Tabs are replaced (from left to right) by one to eight spaces - // such that the total number of characters up to and including - // the replacement is a multiple of eight [...]" - // - // -- https://docs.python.org/3.1/reference/lexical_analysis.html#indentation - static int getIndentationCount(std::string spaces) { - int count = 0; - for (char ch : spaces) { - switch (ch) { - case '\t': - count += 8 - (count % 8); - break; - default: - // A normal space char. - count++; - } - } - - return count; - } - - bool atStartOfInput() { - return Lexer::getCharPositionInLine() == 0 && Lexer::getLine() == 1; - } + // Calculates the indentation of the provided spaces, taking the + // following rules into account: + // + // "Tabs are replaced (from left to right) by one to eight spaces + // such that the total number of characters up to and including + // the replacement is a multiple of eight [...]" + // + // -- https://docs.python.org/3.1/reference/lexical_analysis.html#indentation + static int getIndentationCount(std::string const &spaces) { + int count = 0; + for (auto ch : spaces) + if (ch == '\t') count += 8 - (count % 8); + else ++count; // normal space char + return count; + } + bool atStartOfInput() { + return Lexer::getCharPositionInLine() == 0 && Lexer::getLine() == 1; + } std::string getGrammarFileName() const override; diff --git a/resources/Python3Lexer.g4 b/resources/Python3Lexer.g4 index 8270ee4..83732fc 100644 --- a/resources/Python3Lexer.g4 +++ b/resources/Python3Lexer.g4 @@ -10,110 +10,90 @@ tokens { } @lexer::members { - // A queue where extra tokens are pushed on (see the NEWLINE lexer rule). - private: std::list tokens ; - // The stack that keeps track of the indentation level. - private: std::stack indents ; - // The amount of opened braces, brackets and parenthesis. - private: int opened = 0; - // The most recently produced token. - private: antlr4::Token* lastToken = nullptr; +private: + // A queue where extra tokens are pushed on (see the NEWLINE lexer rule). + std::list tokens; +private: + // The stack that keeps track of the indentation level. + std::stack indents; +private: + // The amount of opened braces, brackets and parenthesis. + int opened = 0; +public: + void emit(std::unique_ptr t) override { + tokens.push_back(t.get()); + token.release(); + token = std::move(t); + } - public: void emit(std::unique_ptr t) override { - token.release(); - token=std::move(t); +public: + std::unique_ptr nextToken() override { + // Check if the end-of-file is ahead and there are still some DEDENTS expected. + if (_input->LA(1) == EOF && !this->indents.empty()) { + // Remove any trailing EOF tokens from our buffer. + for (auto i = tokens.rbegin(); i != tokens.rend();) { + auto tmp = i; + i++; + if ((*tmp)->getType() == EOF) { + tokens.erase(tmp.base()); + } + } - tokens.push_back(token.get()); - // std::cout<toString()<emit(std::move(tmp)); + indents.pop(); + } - public: std::unique_ptr nextToken() override { - // Check if the end-of-file is ahead and there are still some DEDENTS expected. - if (_input->LA(1) == EOF && !this->indents.empty()) { - // Remove any trailing EOF tokens from our buffer. - for(auto i=tokens.rbegin();i!=tokens.rend();){ - auto tmp=i; - i++; - if((*tmp)->getType()==EOF){ - tokens.erase(tmp.base()); - } - } + // Put the EOF back on the token stream. + this->emit(make_CommonToken(static_cast(Python3Lexer::EOF), "")); + } + if (tokens.empty()) { + std::unique_ptr next = Lexer::nextToken(); + next.release(); + // release it because it should be controlled by 'tokens' now + } + auto tmp = tokens.front(); + tokens.pop_front(); + return std::unique_ptr(tmp); + } +private: + std::unique_ptr createDedent() { + auto dedent = make_CommonToken(Python3Lexer::DEDENT, ""); + dedent->setText("DEDENT"); + return std::move(dedent); + } - // First emit an extra line break that serves as the end of the statement. - std::unique_ptr tmp=commonToken(Python3Lexer::NEWLINE, "\n"); - this->emit(std::move(tmp)); +private: + std::unique_ptr make_CommonToken(int type, std::string const &text) { + size_t stop = this->getCharIndex() - 1; + size_t start = text.empty() ? stop : stop - text.length() + 1; + return std::make_unique(std::make_pair(this, _input), type, DEFAULT_TOKEN_CHANNEL, start, stop); + } - // Now emit as much DEDENT tokens as needed. - while (!indents.empty()) { - auto tmp=createDedent(); - this->emit(std::move(tmp)); - indents.pop(); - } - - // Put the EOF back on the token stream. - this->emit(commonToken(static_cast(Python3Lexer::EOF), "")); - } - - std::unique_ptr next = Lexer::nextToken(); - - if (next->getChannel() == antlr4::Token::DEFAULT_CHANNEL) { - // Keep track of the last token on the default channel. - this->lastToken = next.get(); - } - if (tokens.empty()) { - return std::move(next); - } else{ - next.release(); - auto tmp=tokens.front(); - tokens.pop_front(); - return std::unique_ptr(tmp); - } - - } - - private: std::unique_ptr createDedent() { - auto dedent = commonToken(Python3Lexer::DEDENT, ""); - dedent->setLine(this->lastToken->getLine()); - return std::move(dedent); - } - - private: std::unique_ptr commonToken(int type,std::string text) { - int stop = this->getCharIndex() - 1; - int start = text.empty() ? stop : stop - text.length() + 1; - return std::move(std::unique_ptr(new antlr4::CommonToken({ this, _input }, - type, - DEFAULT_TOKEN_CHANNEL, start, stop))); - } - - // Calculates the indentation of the provided spaces, taking the - // following rules into account: - // - // "Tabs are replaced (from left to right) by one to eight spaces - // such that the total number of characters up to and including - // the replacement is a multiple of eight [...]" - // - // -- https://docs.python.org/3.1/reference/lexical_analysis.html#indentation - static int getIndentationCount(std::string spaces) { - int count = 0; - for (char ch : spaces) { - switch (ch) { - case '\t': - count += 8 - (count % 8); - break; - default: - // A normal space char. - count++; - } - } - - return count; - } - - bool atStartOfInput() { - return Lexer::getCharPositionInLine() == 0 && Lexer::getLine() == 1; - } + // Calculates the indentation of the provided spaces, taking the + // following rules into account: + // + // "Tabs are replaced (from left to right) by one to eight spaces + // such that the total number of characters up to and including + // the replacement is a multiple of eight [...]" + // + // -- https://docs.python.org/3.1/reference/lexical_analysis.html#indentation + static int getIndentationCount(std::string const &spaces) { + int count = 0; + for (auto ch : spaces) + if (ch == '\t') count += 8 - (count % 8); + else ++count; // normal space char + return count; + } + bool atStartOfInput() { + return Lexer::getCharPositionInLine() == 0 && Lexer::getLine() == 1; + } } STRING: STRING_LITERAL | BYTES_LITERAL; @@ -147,42 +127,39 @@ NEWLINE: ( {atStartOfInput()}? SPACES | ( '\r'? '\n' | '\r' | '\f') SPACES? ) { - { - std::string pattern1="[^\r\n\f]+"; - std::string pattern2="[\r\n\f]+"; - std::regex re1(pattern1); - std::regex re2(pattern2); - std::string fmt=""; - std::string newLine=regex_replace(getText(),re1,fmt); - std::string spaces = regex_replace(getText(),re2,fmt); - int next = _input->LA(1); - if (opened > 0 || next == '\r' || next == '\n' || next == '\f' || next == '#') { - // If we're inside a list or on a blank line, ignore all indents, - // dedents and line breaks. - skip(); - } - else { - emit(commonToken(NEWLINE, newLine)); - int indent = getIndentationCount(spaces); - int previous = indents.empty() ? 0 : indents.top(); - if (indent == previous) { - // skip indents of the same size as the present indent-size - skip(); - } - else if (indent > previous) { - indents.push(indent); - emit(commonToken(Python3Lexer::INDENT, spaces)); - } - else { - // Possibly emit more than 1 DEDENT token. - while(!indents.empty() && indents.top() > indent) { - this->emit(createDedent()); - indents.pop(); - } - } - } +{ // Braces are required inside the switch + std::regex re1(R"([^\r\n\f]+)"); + std::regex re2(R"([\r\n\f]+)"); + std::string newLine = regex_replace(getText(), re1, ""); + std::string spaces = regex_replace(getText(), re2, ""); + int next = _input->LA(1); + if (opened > 0 || next == '\r' || next == '\n' || next == '\f' || next == '#') { + // If we're inside a list or on a blank line, ignore all indents, + // dedents and line breaks. + skip(); } - }; + else { + emit(make_CommonToken(NEWLINE, newLine)); + int indent = getIndentationCount(spaces); + int previous = indents.empty() ? 0 : indents.top(); + if (indent == previous) { + // skip indents of the same size as the present indent-size + // do nothing + } + else if (indent > previous) { + indents.push(indent); + emit(make_CommonToken(Python3Lexer::INDENT, spaces)); + } + else { + // Possibly emit more than 1 DEDENT token. + while (!indents.empty() && indents.top() > indent) { + this->emit(createDedent()); + indents.pop(); + } + } + } +} +}; /// identifier ::= id_start id_continue* NAME: ID_START ID_CONTINUE*;