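fix(lexer): 修复了词法分析器中的一些问题 (drop the extra skip() after emitting NEWLINE at unchanged indentation; rename commonToken to make_CommonToken)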

2023-11-05 12:14:29 +08:00
parent 2e29af68b3
commit 4be04cec2d
3 changed files with 218 additions and 264 deletions
--- a/generated/Python3Lexer.cpp
+++ b/generated/Python3Lexer.cpp
@@ -515,14 +515,11 @@ bool Python3Lexer::sempred(RuleContext *context, size_t ruleIndex, size_t predic
 void Python3Lexer::NEWLINEAction(antlr4::RuleContext *context, size_t actionIndex) {
  switch (actionIndex) {
    case 0: 
-    	{
+    { // Braces are required inside the switch
-         std::string pattern1="[^\r\n\f]+";
+    	std::regex re1(R"([^\r\n\f]+)");
-         std::string pattern2="[\r\n\f]+";
+    	std::regex re2(R"([\r\n\f]+)");
-         std::regex re1(pattern1);
+    	std::string newLine = regex_replace(getText(), re1, "");
-         std::regex re2(pattern2);
+    	std::string spaces = regex_replace(getText(), re2, "");
         std::string fmt="";
         std::string newLine=regex_replace(getText(),re1,fmt);
          std::string spaces = regex_replace(getText(),re2,fmt);
    	int next = _input->LA(1);
    	if (opened > 0 || next == '\r' || next == '\n' || next == '\f' || next == '#') {
    		// If we're inside a list or on a blank line, ignore all indents,
@@ -530,16 +527,16 @@ void Python3Lexer::NEWLINEAction(antlr4::RuleContext *context, size_t actionInde
    		skip();
    	}
    	else {
-            emit(commonToken(NEWLINE, newLine));
+    		emit(make_CommonToken(NEWLINE, newLine));
    		int indent = getIndentationCount(spaces);
    		int previous = indents.empty() ? 0 : indents.top();
    		if (indent == previous) {
    			// skip indents of the same size as the present indent-size
-              skip();
+    			// do nothing
    		}
    		else if (indent > previous) {
    			indents.push(indent);
-              emit(commonToken(Python3Lexer::INDENT, spaces));
+    			emit(make_CommonToken(Python3Lexer::INDENT, spaces));
    		}
    		else {
    			// Possibly emit more than 1 DEDENT token.
--- a/generated/Python3Lexer.h
+++ b/generated/Python3Lexer.h
@@ -39,25 +39,24 @@ public:
  ~Python3Lexer() override;
  private:
  	// A queue where extra tokens are pushed on (see the NEWLINE lexer rule).
-   private: std::list<antlr4::Token*> tokens ;
+  	std::list<antlr4::Token *> tokens;
  private:
  	// The stack that keeps track of the indentation level.
-   private: std::stack<int> indents ;
+  	std::stack<int> indents;
  private:
  	// The amount of opened braces, brackets and parenthesis.
-   private: int opened = 0;
+  	int opened = 0;
-       // The most recently produced token.
+  public:
-   private: antlr4::Token* lastToken = nullptr;
+  	void emit(std::unique_ptr<antlr4::Token> t) override {
-
+  		tokens.push_back(t.get());
   public: void emit(std::unique_ptr<antlr4::Token> t) override {
  		token.release();
  		token = std::move(t);
         tokens.push_back(token.get());
   //      std::cout<<t->toString()<<std::endl;
  	}
-
+  public:
-   public: std::unique_ptr<antlr4::Token> nextToken() override {
+  	std::unique_ptr<antlr4::Token> nextToken() override {
  		// Check if the end-of-file is ahead and there are still some DEDENTS expected.
  		if (_input->LA(1) == EOF && !this->indents.empty()) {
  			// Remove any trailing EOF tokens from our buffer.
@@ -69,10 +68,8 @@ public:
  				}
  			}
  			// First emit an extra line break that serves as the end of the statement.
-           std::unique_ptr<antlr4::Token> tmp=commonToken(Python3Lexer::NEWLINE, "\n");
+  			emit(make_CommonToken(Python3Lexer::NEWLINE, "\n"));
           this->emit(std::move(tmp));
  			// Now emit as much DEDENT tokens as needed.
  			while (!indents.empty()) {
@@ -82,38 +79,30 @@ public:
  			}
  			// Put the EOF back on the token stream.
-           this->emit(commonToken(static_cast<int>(Python3Lexer::EOF), "<EOF>"));
+  			this->emit(make_CommonToken(static_cast<int>(Python3Lexer::EOF), "<EOF>"));
         }
         std::unique_ptr<antlr4::Token> next = Lexer::nextToken();
         if (next->getChannel() == antlr4::Token::DEFAULT_CHANNEL) {
           // Keep track of the last token on the default channel.
           this->lastToken = next.get();
  		}
  		if (tokens.empty()) {
-               return std::move(next);
+  			std::unique_ptr<antlr4::Token> next = Lexer::nextToken();
           } else{
  			next.release();
  			// release it because it should be controlled by 'tokens' now
  		}
  		auto tmp = tokens.front();
  		tokens.pop_front();
  		return std::unique_ptr<antlr4::Token>(tmp);
  	}
-       }
+  private:
-
+  	std::unique_ptr<antlr4::Token> createDedent() {
-   private: std::unique_ptr<antlr4::Token> createDedent() {
+  		auto dedent = make_CommonToken(Python3Lexer::DEDENT, "");
-         auto dedent = commonToken(Python3Lexer::DEDENT, "");
+  		dedent->setText("DEDENT");
         dedent->setLine(this->lastToken->getLine());
  		return std::move(dedent);
  	}
-   private: std::unique_ptr<antlr4::CommonToken> commonToken(int type,std::string text) {
+  private:
-         int stop = this->getCharIndex() - 1;
+  	std::unique_ptr<antlr4::CommonToken> make_CommonToken(int type, std::string const &text) {
-         int start = text.empty() ? stop : stop - text.length() + 1;
+  		size_t stop = this->getCharIndex() - 1;
-         return std::move(std::unique_ptr<antlr4::CommonToken>(new antlr4::CommonToken({ this, _input },
+  		size_t start = text.empty() ? stop : stop - text.length() + 1;
-                 type,
+  		return std::make_unique<antlr4::CommonToken>(std::make_pair(this, _input), type, DEFAULT_TOKEN_CHANNEL, start, stop);
                 DEFAULT_TOKEN_CHANNEL, start, stop)));
  	}
  	// Calculates the indentation of the provided spaces, taking the
@@ -124,22 +113,13 @@ public:
  	//  the replacement is a multiple of eight [...]"
  	//
  	//  -- https://docs.python.org/3.1/reference/lexical_analysis.html#indentation
-       static int getIndentationCount(std::string spaces) {
+  	static int getIndentationCount(std::string const &spaces) {
  		int count = 0;
-         for (char ch : spaces) {
+  		for (auto ch : spaces)
-           switch (ch) {
+  			if (ch == '\t') count += 8 - (count % 8);
-             case '\t':
+  			else ++count; // normal space char
               count += 8 - (count % 8);
               break;
             default:
               // A normal space char.
               count++;
           }
         }
  		return count;
  	}
  	bool atStartOfInput() {
  		return Lexer::getCharPositionInLine() == 0 && Lexer::getLine() == 1;
  	}
--- a/resources/Python3Lexer.g4
+++ b/resources/Python3Lexer.g4
@@ -10,25 +10,24 @@ tokens {
 }
@lexer::members {
 private:
 	// A queue where extra tokens are pushed on (see the NEWLINE lexer rule).
- private: std::list<antlr4::Token*> tokens ;
+	std::list<antlr4::Token *> tokens;
 private:
 	// The stack that keeps track of the indentation level.
- private: std::stack<int> indents ;
+	std::stack<int> indents;
 private:
 	// The amount of opened braces, brackets and parenthesis.
- private: int opened = 0;
+	int opened = 0;
-     // The most recently produced token.
+public:
- private: antlr4::Token* lastToken = nullptr;
+	void emit(std::unique_ptr<antlr4::Token> t) override {
-
+		tokens.push_back(t.get());
 public: void emit(std::unique_ptr<antlr4::Token> t) override {
 		token.release();
 		token = std::move(t);
       tokens.push_back(token.get());
 //      std::cout<<t->toString()<<std::endl;
 	}
-
+public:
- public: std::unique_ptr<antlr4::Token> nextToken() override {
+	std::unique_ptr<antlr4::Token> nextToken() override {
 		// Check if the end-of-file is ahead and there are still some DEDENTS expected.
 		if (_input->LA(1) == EOF && !this->indents.empty()) {
 			// Remove any trailing EOF tokens from our buffer.
@@ -40,10 +39,8 @@ tokens {
 				}
 			}
 			// First emit an extra line break that serves as the end of the statement.
-         std::unique_ptr<antlr4::Token> tmp=commonToken(Python3Lexer::NEWLINE, "\n");
+			emit(make_CommonToken(Python3Lexer::NEWLINE, "\n"));
         this->emit(std::move(tmp));
 			// Now emit as much DEDENT tokens as needed.
 			while (!indents.empty()) {
@@ -53,38 +50,30 @@ tokens {
 			}
 			// Put the EOF back on the token stream.
-         this->emit(commonToken(static_cast<int>(Python3Lexer::EOF), "<EOF>"));
+			this->emit(make_CommonToken(static_cast<int>(Python3Lexer::EOF), "<EOF>"));
       }
       std::unique_ptr<antlr4::Token> next = Lexer::nextToken();
       if (next->getChannel() == antlr4::Token::DEFAULT_CHANNEL) {
         // Keep track of the last token on the default channel.
         this->lastToken = next.get();
 		}
 		if (tokens.empty()) {
-             return std::move(next);
+			std::unique_ptr<antlr4::Token> next = Lexer::nextToken();
         } else{
 			next.release();
 			// release it because it should be controlled by 'tokens' now
 		}
 		auto tmp = tokens.front();
 		tokens.pop_front();
 		return std::unique_ptr<antlr4::Token>(tmp);
 	}
-     }
+private:
-
+	std::unique_ptr<antlr4::Token> createDedent() {
- private: std::unique_ptr<antlr4::Token> createDedent() {
+		auto dedent = make_CommonToken(Python3Lexer::DEDENT, "");
-       auto dedent = commonToken(Python3Lexer::DEDENT, "");
+		dedent->setText("DEDENT");
       dedent->setLine(this->lastToken->getLine());
 		return std::move(dedent);
 	}
- private: std::unique_ptr<antlr4::CommonToken> commonToken(int type,std::string text) {
+private:
-       int stop = this->getCharIndex() - 1;
+	std::unique_ptr<antlr4::CommonToken> make_CommonToken(int type, std::string const &text) {
-       int start = text.empty() ? stop : stop - text.length() + 1;
+		size_t stop = this->getCharIndex() - 1;
-       return std::move(std::unique_ptr<antlr4::CommonToken>(new antlr4::CommonToken({ this, _input },
+		size_t start = text.empty() ? stop : stop - text.length() + 1;
-               type,
+		return std::make_unique<antlr4::CommonToken>(std::make_pair(this, _input), type, DEFAULT_TOKEN_CHANNEL, start, stop);
               DEFAULT_TOKEN_CHANNEL, start, stop)));
 	}
 	// Calculates the indentation of the provided spaces, taking the
@@ -95,22 +84,13 @@ tokens {
 	//  the replacement is a multiple of eight [...]"
 	//
 	//  -- https://docs.python.org/3.1/reference/lexical_analysis.html#indentation
-     static int getIndentationCount(std::string spaces) {
+	static int getIndentationCount(std::string const &spaces) {
 		int count = 0;
-       for (char ch : spaces) {
+		for (auto ch : spaces)
-         switch (ch) {
+			if (ch == '\t') count += 8 - (count % 8);
-           case '\t':
+			else ++count; // normal space char
             count += 8 - (count % 8);
             break;
           default:
             // A normal space char.
             count++;
         }
       }
 		return count;
 	}
 	bool atStartOfInput() {
 		return Lexer::getCharPositionInLine() == 0 && Lexer::getLine() == 1;
 	}
@@ -147,14 +127,11 @@ NEWLINE: (
 		{atStartOfInput()}? SPACES
 		| ( '\r'? '\n' | '\r' | '\f') SPACES?
 	) {
-	{
+{ // Braces are required inside the switch
-     std::string pattern1="[^\r\n\f]+";
+	std::regex re1(R"([^\r\n\f]+)");
-     std::string pattern2="[\r\n\f]+";
+	std::regex re2(R"([\r\n\f]+)");
-     std::regex re1(pattern1);
+	std::string newLine = regex_replace(getText(), re1, "");
-     std::regex re2(pattern2);
+	std::string spaces = regex_replace(getText(), re2, "");
     std::string fmt="";
     std::string newLine=regex_replace(getText(),re1,fmt);
      std::string spaces = regex_replace(getText(),re2,fmt);
 	int next = _input->LA(1);
 	if (opened > 0 || next == '\r' || next == '\n' || next == '\f' || next == '#') {
 		// If we're inside a list or on a blank line, ignore all indents,
@@ -162,16 +139,16 @@ NEWLINE: (
 		skip();
 	}
 	else {
-        emit(commonToken(NEWLINE, newLine));
+		emit(make_CommonToken(NEWLINE, newLine));
 		int indent = getIndentationCount(spaces);
 		int previous = indents.empty() ? 0 : indents.top();
 		if (indent == previous) {
 			// skip indents of the same size as the present indent-size
-          skip();
+			// do nothing
 		}
 		else if (indent > previous) {
 			indents.push(indent);
-          emit(commonToken(Python3Lexer::INDENT, spaces));
+			emit(make_CommonToken(Python3Lexer::INDENT, spaces));
 		}
 		else {
 			// Possibly emit more than 1 DEDENT token.