From 9dbd77c0d1123af5963651fc0a3331d1473d3c63 Mon Sep 17 00:00:00 2001 From: Daniel Grunwald Date: Tue, 7 Aug 2007 13:04:12 +0000 Subject: [PATCH] C# parser: allow unicode escape sequences in identifier names (yes, "int numb\u0065r;" is valid C#!) git-svn-id: svn://svn.sharpdevelop.net/sharpdevelop/branches/2.1@2637 1ccf3a8d-04fe-1044-b7c0-cef0b8235c61 --- .../Project/Src/Lexer/CSharp/Lexer.cs | 60 +++++++++++++++---- .../Test/Lexer/CSharp/CustomLexerTests.cs | 48 ++++++++++----- 2 files changed, 79 insertions(+), 29 deletions(-) diff --git a/src/Libraries/NRefactory/Project/Src/Lexer/CSharp/Lexer.cs b/src/Libraries/NRefactory/Project/Src/Lexer/CSharp/Lexer.cs index 19e993c9ff..03a3fdd88e 100644 --- a/src/Libraries/NRefactory/Project/Src/Lexer/CSharp/Lexer.cs +++ b/src/Libraries/NRefactory/Project/Src/Lexer/CSharp/Lexer.cs @@ -21,7 +21,8 @@ namespace ICSharpCode.NRefactory.Parser.CSharp void ReadPreProcessingDirective() { Location start = new Location(Col - 1, Line); - string directive = ReadIdent('#'); + bool canBeKeyword; + string directive = ReadIdent('#', out canBeKeyword); string argument = ReadToEndOfLine(); this.specialTracker.AddPreprocessingDirective(directive, argument.Trim(), start, new Location(start.X + directive.Length + argument.Length, start.Y)); } @@ -80,7 +81,8 @@ namespace ICSharpCode.NRefactory.Parser.CSharp if (ch == '"') { token = ReadVerbatimString(); } else if (Char.IsLetterOrDigit(ch) || ch == '_') { - token = new Token(Tokens.Identifier, x - 1, y, ReadIdent(ch)); + bool canBeKeyword; + token = new Token(Tokens.Identifier, x - 1, y, ReadIdent(ch, out canBeKeyword)); } else { errors.Error(y, x, String.Format("Unexpected char in Lexer.Next() : {0}", ch)); continue; @@ -89,13 +91,16 @@ namespace ICSharpCode.NRefactory.Parser.CSharp break; default: ch = (char)nextChar; - if (Char.IsLetter(ch) || ch == '_') { + if (Char.IsLetter(ch) || ch == '_' || ch == '\\') { int x = Col - 1; // Col was incremented above, but we want the start of the identifier int y = Line; - string s = ReadIdent(ch); - int keyWordToken = Keywords.GetToken(s); - if (keyWordToken >= 0) { - return new Token(keyWordToken, x, y); + bool canBeKeyword; + string s = ReadIdent(ch, out canBeKeyword); + if (canBeKeyword) { + int keyWordToken = Keywords.GetToken(s); + if (keyWordToken >= 0) { + return new Token(keyWordToken, x, y); + } } return new Token(Tokens.Identifier, x, y, s); } else if (Char.IsDigit(ch)) { @@ -120,16 +125,39 @@ namespace ICSharpCode.NRefactory.Parser.CSharp const int MAX_IDENTIFIER_LENGTH = 512; char[] identBuffer = new char[MAX_IDENTIFIER_LENGTH]; - string ReadIdent(char ch) + string ReadIdent(char ch, out bool canBeKeyword) { int peek; - int curPos = 1; - identBuffer[0] = ch; - while (IsIdentifierPart(peek = ReaderPeek())) { - ReaderRead(); + int curPos = 0; + canBeKeyword = true; + while (true) { + if (ch == '\\') { + peek = ReaderPeek(); + if (peek != 'u' && peek != 'U') { + errors.Error(Line, Col, "Identifiers can only contain unicode escape sequences"); + } + canBeKeyword = false; + string surrogatePair; + ReadEscapeSequence(out ch, out surrogatePair); + if (surrogatePair != null) { + if (!char.IsLetterOrDigit(surrogatePair, 0)) { + errors.Error(Line, Col, "Unicode escape sequences in identifiers cannot be used to represent characters that are invalid in identifiers"); + } + for (int i = 0; i < surrogatePair.Length - 1; i++) { + if (curPos < MAX_IDENTIFIER_LENGTH) { + identBuffer[curPos++] = surrogatePair[i]; + } + } + ch = surrogatePair[surrogatePair.Length - 1]; + } else { + if (!IsIdentifierPart(ch)) { + errors.Error(Line, Col, "Unicode escape sequences in identifiers cannot be used to represent characters that are invalid in identifiers"); + } + } + } if (curPos < MAX_IDENTIFIER_LENGTH) { - identBuffer[curPos++] = (char)peek; + identBuffer[curPos++] = ch; } else { errors.Error(Line, Col, String.Format("Identifier too long")); while (IsIdentifierPart(ReaderPeek())) { @@ -137,6 +165,12 @@ namespace ICSharpCode.NRefactory.Parser.CSharp } break; } + peek = ReaderPeek(); + if (IsIdentifierPart(peek) || peek == '\\') { + ch = (char)ReaderRead(); + } else { + break; + } } return new String(identBuffer, 0, curPos); } diff --git a/src/Libraries/NRefactory/Test/Lexer/CSharp/CustomLexerTests.cs b/src/Libraries/NRefactory/Test/Lexer/CSharp/CustomLexerTests.cs index f5864e2db5..528647acd4 100644 --- a/src/Libraries/NRefactory/Test/Lexer/CSharp/CustomLexerTests.cs +++ b/src/Libraries/NRefactory/Test/Lexer/CSharp/CustomLexerTests.cs @@ -32,41 +32,57 @@ namespace ICSharpCode.NRefactory.Tests.Lexer.CSharp Assert.AreEqual(Tokens.EOF, lexer.NextToken().kind); } - [Test] - public void TestIdentifier() + void CheckIdentifier(string text, string actualIdentifier) { - ILexer lexer = GenerateLexer(new StringReader("a_Bc05")); + ILexer lexer = GenerateLexer(new StringReader(text)); Token t = lexer.NextToken(); Assert.AreEqual(Tokens.Identifier, t.kind); - Assert.AreEqual("a_Bc05", t.val); + Assert.AreEqual(actualIdentifier, t.val); + t = lexer.NextToken(); + Assert.AreEqual(Tokens.EOF, t.kind); + Assert.AreEqual("", lexer.Errors.ErrorOutput); + } + + [Test] + public void TestIdentifier() + { + CheckIdentifier("a_Bc05", "a_Bc05"); } [Test] public void TestIdentifierStartingWithUnderscore() { - ILexer lexer = GenerateLexer(new StringReader("_Bc05")); - Token t = lexer.NextToken(); - Assert.AreEqual(Tokens.Identifier, t.kind); - Assert.AreEqual("_Bc05", t.val); + CheckIdentifier("_Bc05", "_Bc05"); + } + + [Test] + public void TestIdentifierStartingWithEscapeSequence() + { + CheckIdentifier(@"\u006cexer", "lexer"); + } + + [Test] + public void TestIdentifierContainingEscapeSequence() + { + CheckIdentifier(@"l\U00000065xer", "lexer"); } [Test] public void TestKeyWordAsIdentifier() { - ILexer lexer = GenerateLexer(new StringReader("@int")); - Token t = lexer.NextToken(); - Assert.AreEqual(Tokens.Identifier, t.kind); - Assert.AreEqual("int", t.val); + CheckIdentifier("@int", "int"); } + [Test] + public void TestKeywordWithEscapeSequenceIsIdentifier() + { + CheckIdentifier(@"i\u006et", "int"); + } [Test] public void TestKeyWordAsIdentifierStartingWithUnderscore() { - ILexer lexer = GenerateLexer(new StringReader("@_int")); - Token t = lexer.NextToken(); - Assert.AreEqual(Tokens.Identifier, t.kind); - Assert.AreEqual("_int", t.val); + CheckIdentifier("@_int", "_int"); } [Test]