fix(lexer): fix some issues in the lexer

Wankupi
2023-11-05 12:14:29 +08:00
parent 2e29af68b3
commit 4be04cec2d
3 changed files with 218 additions and 264 deletions


@@ -515,14 +515,11 @@ bool Python3Lexer::sempred(RuleContext *context, size_t ruleIndex, size_t predic
void Python3Lexer::NEWLINEAction(antlr4::RuleContext *context, size_t actionIndex) {
switch (actionIndex) {
case 0:
{
std::string pattern1="[^\r\n\f]+";
std::string pattern2="[\r\n\f]+";
std::regex re1(pattern1);
std::regex re2(pattern2);
std::string fmt="";
std::string newLine=regex_replace(getText(),re1,fmt);
std::string spaces = regex_replace(getText(),re2,fmt);
{ // Braces are required inside the switch
std::regex re1(R"([^\r\n\f]+)");
std::regex re2(R"([\r\n\f]+)");
std::string newLine = regex_replace(getText(), re1, "");
std::string spaces = regex_replace(getText(), re2, "");
int next = _input->LA(1);
if (opened > 0 || next == '\r' || next == '\n' || next == '\f' || next == '#') {
// If we're inside a list or on a blank line, ignore all indents,
@@ -530,16 +527,16 @@ void Python3Lexer::NEWLINEAction(antlr4::RuleContext *context, size_t actionInde
skip();
}
else {
emit(commonToken(NEWLINE, newLine));
emit(make_CommonToken(NEWLINE, newLine));
int indent = getIndentationCount(spaces);
int previous = indents.empty() ? 0 : indents.top();
if (indent == previous) {
// skip indents of the same size as the present indent-size
skip();
// do nothing
}
else if (indent > previous) {
indents.push(indent);
emit(commonToken(Python3Lexer::INDENT, spaces));
emit(make_CommonToken(Python3Lexer::INDENT, spaces));
}
else {
// Possibly emit more than 1 DEDENT token.


@@ -39,25 +39,24 @@ public:
~Python3Lexer() override;
private:
// A queue where extra tokens are pushed on (see the NEWLINE lexer rule).
private: std::list<antlr4::Token*> tokens ;
std::list<antlr4::Token *> tokens;
private:
// The stack that keeps track of the indentation level.
private: std::stack<int> indents ;
std::stack<int> indents;
private:
// The amount of opened braces, brackets and parenthesis.
private: int opened = 0;
// The most recently produced token.
private: antlr4::Token* lastToken = nullptr;
public: void emit(std::unique_ptr<antlr4::Token> t) override {
int opened = 0;
public:
void emit(std::unique_ptr<antlr4::Token> t) override {
tokens.push_back(t.get());
token.release();
token = std::move(t);
tokens.push_back(token.get());
// std::cout<<t->toString()<<std::endl;
}
public: std::unique_ptr<antlr4::Token> nextToken() override {
public:
std::unique_ptr<antlr4::Token> nextToken() override {
// Check if the end-of-file is ahead and there are still some DEDENTS expected.
if (_input->LA(1) == EOF && !this->indents.empty()) {
// Remove any trailing EOF tokens from our buffer.
@@ -69,10 +68,8 @@ public:
}
}
// First emit an extra line break that serves as the end of the statement.
std::unique_ptr<antlr4::Token> tmp=commonToken(Python3Lexer::NEWLINE, "\n");
this->emit(std::move(tmp));
emit(make_CommonToken(Python3Lexer::NEWLINE, "\n"));
// Now emit as much DEDENT tokens as needed.
while (!indents.empty()) {
@@ -82,38 +79,30 @@ public:
}
// Put the EOF back on the token stream.
this->emit(commonToken(static_cast<int>(Python3Lexer::EOF), "<EOF>"));
}
std::unique_ptr<antlr4::Token> next = Lexer::nextToken();
if (next->getChannel() == antlr4::Token::DEFAULT_CHANNEL) {
// Keep track of the last token on the default channel.
this->lastToken = next.get();
this->emit(make_CommonToken(static_cast<int>(Python3Lexer::EOF), "<EOF>"));
}
if (tokens.empty()) {
return std::move(next);
} else{
std::unique_ptr<antlr4::Token> next = Lexer::nextToken();
next.release();
// release it because it should be controlled by 'tokens' now
}
auto tmp = tokens.front();
tokens.pop_front();
return std::unique_ptr<antlr4::Token>(tmp);
}
}
private: std::unique_ptr<antlr4::Token> createDedent() {
auto dedent = commonToken(Python3Lexer::DEDENT, "");
dedent->setLine(this->lastToken->getLine());
private:
std::unique_ptr<antlr4::Token> createDedent() {
auto dedent = make_CommonToken(Python3Lexer::DEDENT, "");
dedent->setText("DEDENT");
return std::move(dedent);
}
private: std::unique_ptr<antlr4::CommonToken> commonToken(int type,std::string text) {
int stop = this->getCharIndex() - 1;
int start = text.empty() ? stop : stop - text.length() + 1;
return std::move(std::unique_ptr<antlr4::CommonToken>(new antlr4::CommonToken({ this, _input },
type,
DEFAULT_TOKEN_CHANNEL, start, stop)));
private:
std::unique_ptr<antlr4::CommonToken> make_CommonToken(int type, std::string const &text) {
size_t stop = this->getCharIndex() - 1;
size_t start = text.empty() ? stop : stop - text.length() + 1;
return std::make_unique<antlr4::CommonToken>(std::make_pair(this, _input), type, DEFAULT_TOKEN_CHANNEL, start, stop);
}
// Calculates the indentation of the provided spaces, taking the
@@ -124,22 +113,13 @@ public:
// the replacement is a multiple of eight [...]"
//
// -- https://docs.python.org/3.1/reference/lexical_analysis.html#indentation
static int getIndentationCount(std::string spaces) {
static int getIndentationCount(std::string const &spaces) {
int count = 0;
for (char ch : spaces) {
switch (ch) {
case '\t':
count += 8 - (count % 8);
break;
default:
// A normal space char.
count++;
}
}
for (auto ch : spaces)
if (ch == '\t') count += 8 - (count % 8);
else ++count; // normal space char
return count;
}
bool atStartOfInput() {
return Lexer::getCharPositionInLine() == 0 && Lexer::getLine() == 1;
}


@@ -10,25 +10,24 @@ tokens {
}
@lexer::members {
private:
// A queue where extra tokens are pushed on (see the NEWLINE lexer rule).
private: std::list<antlr4::Token*> tokens ;
std::list<antlr4::Token *> tokens;
private:
// The stack that keeps track of the indentation level.
private: std::stack<int> indents ;
std::stack<int> indents;
private:
// The amount of opened braces, brackets and parenthesis.
private: int opened = 0;
// The most recently produced token.
private: antlr4::Token* lastToken = nullptr;
public: void emit(std::unique_ptr<antlr4::Token> t) override {
int opened = 0;
public:
void emit(std::unique_ptr<antlr4::Token> t) override {
tokens.push_back(t.get());
token.release();
token = std::move(t);
tokens.push_back(token.get());
// std::cout<<t->toString()<<std::endl;
}
public: std::unique_ptr<antlr4::Token> nextToken() override {
public:
std::unique_ptr<antlr4::Token> nextToken() override {
// Check if the end-of-file is ahead and there are still some DEDENTS expected.
if (_input->LA(1) == EOF && !this->indents.empty()) {
// Remove any trailing EOF tokens from our buffer.
@@ -40,10 +39,8 @@ tokens {
}
}
// First emit an extra line break that serves as the end of the statement.
std::unique_ptr<antlr4::Token> tmp=commonToken(Python3Lexer::NEWLINE, "\n");
this->emit(std::move(tmp));
emit(make_CommonToken(Python3Lexer::NEWLINE, "\n"));
// Now emit as much DEDENT tokens as needed.
while (!indents.empty()) {
@@ -53,38 +50,30 @@ tokens {
}
// Put the EOF back on the token stream.
this->emit(commonToken(static_cast<int>(Python3Lexer::EOF), "<EOF>"));
}
std::unique_ptr<antlr4::Token> next = Lexer::nextToken();
if (next->getChannel() == antlr4::Token::DEFAULT_CHANNEL) {
// Keep track of the last token on the default channel.
this->lastToken = next.get();
this->emit(make_CommonToken(static_cast<int>(Python3Lexer::EOF), "<EOF>"));
}
if (tokens.empty()) {
return std::move(next);
} else{
std::unique_ptr<antlr4::Token> next = Lexer::nextToken();
next.release();
// release it because it should be controlled by 'tokens' now
}
auto tmp = tokens.front();
tokens.pop_front();
return std::unique_ptr<antlr4::Token>(tmp);
}
}
private: std::unique_ptr<antlr4::Token> createDedent() {
auto dedent = commonToken(Python3Lexer::DEDENT, "");
dedent->setLine(this->lastToken->getLine());
private:
std::unique_ptr<antlr4::Token> createDedent() {
auto dedent = make_CommonToken(Python3Lexer::DEDENT, "");
dedent->setText("DEDENT");
return std::move(dedent);
}
private: std::unique_ptr<antlr4::CommonToken> commonToken(int type,std::string text) {
int stop = this->getCharIndex() - 1;
int start = text.empty() ? stop : stop - text.length() + 1;
return std::move(std::unique_ptr<antlr4::CommonToken>(new antlr4::CommonToken({ this, _input },
type,
DEFAULT_TOKEN_CHANNEL, start, stop)));
private:
std::unique_ptr<antlr4::CommonToken> make_CommonToken(int type, std::string const &text) {
size_t stop = this->getCharIndex() - 1;
size_t start = text.empty() ? stop : stop - text.length() + 1;
return std::make_unique<antlr4::CommonToken>(std::make_pair(this, _input), type, DEFAULT_TOKEN_CHANNEL, start, stop);
}
// Calculates the indentation of the provided spaces, taking the
@@ -95,22 +84,13 @@ tokens {
// the replacement is a multiple of eight [...]"
//
// -- https://docs.python.org/3.1/reference/lexical_analysis.html#indentation
static int getIndentationCount(std::string spaces) {
static int getIndentationCount(std::string const &spaces) {
int count = 0;
for (char ch : spaces) {
switch (ch) {
case '\t':
count += 8 - (count % 8);
break;
default:
// A normal space char.
count++;
}
}
for (auto ch : spaces)
if (ch == '\t') count += 8 - (count % 8);
else ++count; // normal space char
return count;
}
bool atStartOfInput() {
return Lexer::getCharPositionInLine() == 0 && Lexer::getLine() == 1;
}
@@ -147,14 +127,11 @@ NEWLINE: (
{atStartOfInput()}? SPACES
| ( '\r'? '\n' | '\r' | '\f') SPACES?
) {
{
std::string pattern1="[^\r\n\f]+";
std::string pattern2="[\r\n\f]+";
std::regex re1(pattern1);
std::regex re2(pattern2);
std::string fmt="";
std::string newLine=regex_replace(getText(),re1,fmt);
std::string spaces = regex_replace(getText(),re2,fmt);
{ // Braces are required inside the switch
std::regex re1(R"([^\r\n\f]+)");
std::regex re2(R"([\r\n\f]+)");
std::string newLine = regex_replace(getText(), re1, "");
std::string spaces = regex_replace(getText(), re2, "");
int next = _input->LA(1);
if (opened > 0 || next == '\r' || next == '\n' || next == '\f' || next == '#') {
// If we're inside a list or on a blank line, ignore all indents,
@@ -162,16 +139,16 @@ NEWLINE: (
skip();
}
else {
emit(commonToken(NEWLINE, newLine));
emit(make_CommonToken(NEWLINE, newLine));
int indent = getIndentationCount(spaces);
int previous = indents.empty() ? 0 : indents.top();
if (indent == previous) {
// skip indents of the same size as the present indent-size
skip();
// do nothing
}
else if (indent > previous) {
indents.push(indent);
emit(commonToken(Python3Lexer::INDENT, spaces));
emit(make_CommonToken(Python3Lexer::INDENT, spaces));
}
else {
// Possibly emit more than 1 DEDENT token.