From b39e888905efaa9883fd26d31a77bde45675c4d3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?David=20Srbeck=C3=BD?= Date: Thu, 6 Aug 2009 13:07:45 +0000 Subject: [PATCH] XML Parser: Rechecked all parse functions and tried to make them more error prone; Added a lot TODOs for the missing functionality. git-svn-id: svn://svn.sharpdevelop.net/sharpdevelop/trunk@4604 1ccf3a8d-04fe-1044-b7c0-cef0b8235c61 --- .../XmlParser/RawObjects.cs | 34 ++- .../XmlParser/XmlParser.cs | 233 ++++++++++++------ 2 files changed, 179 insertions(+), 88 deletions(-) diff --git a/src/Libraries/AvalonEdit/ICSharpCode.AvalonEdit/XmlParser/RawObjects.cs b/src/Libraries/AvalonEdit/ICSharpCode.AvalonEdit/XmlParser/RawObjects.cs index 206b36eab6..e7f6ca3bd0 100644 --- a/src/Libraries/AvalonEdit/ICSharpCode.AvalonEdit/XmlParser/RawObjects.cs +++ b/src/Libraries/AvalonEdit/ICSharpCode.AvalonEdit/XmlParser/RawObjects.cs @@ -8,14 +8,13 @@ using System; using System.Collections.Generic; using System.Collections.ObjectModel; -using System.Collections.Specialized; using System.Linq; using System.Xml; using System.Xml.Linq; using ICSharpCode.AvalonEdit.Document; -// Missing XML comment +// TODO: Missing XML comment #pragma warning disable 1591 namespace ICSharpCode.AvalonEdit.XmlParser @@ -106,13 +105,15 @@ namespace ICSharpCode.AvalonEdit.XmlParser protected XName EncodeXName(string name) { + if (string.IsNullOrEmpty(name)) name = "_"; + string namesapce = string.Empty; int colonIndex = name.IndexOf(':'); if (colonIndex != -1) { namesapce = name.Substring(0, colonIndex); name = name.Substring(colonIndex + 1); } - if (string.IsNullOrEmpty(name)) name = "_"; + name = XmlConvert.EncodeLocalName(name); namesapce = XmlConvert.EncodeLocalName(namesapce); return XName.Get(name, namesapce); @@ -452,9 +453,12 @@ namespace ICSharpCode.AvalonEdit.XmlParser { if (!firstUpdate) LogLinq("Updating XElement Attributes of '{0}'", this.StartTag.Name); - xElem.ReplaceAttributes(); // Otherwise we get duplicate item exception - XAttribute[] attrs = this.StartTag.Children.OfType().Select(x => x.GetXAttribute()).Distinct(new AttributeNameComparer()).ToArray(); - xElem.ReplaceAttributes(attrs); + // TODO: Investigate null + if (xElem != null) { + xElem.ReplaceAttributes(); // Otherwise we get duplicate item exception + XAttribute[] attrs = this.StartTag.Children.OfType().Select(x => x.GetXAttribute()).Distinct(new AttributeNameComparer()).ToArray(); + xElem.ReplaceAttributes(attrs); + } } void UpdateXElementChildren(bool firstUpdate) @@ -494,6 +498,8 @@ namespace ICSharpCode.AvalonEdit.XmlParser public string EqualsSign { get; set; } public string Value { get; set; } + // TODO: Provide method to dereference Value - & + public override void UpdateDataFrom(RawObject source) { if (this.ReadCallID == source.ReadCallID) return; @@ -548,11 +554,25 @@ namespace ICSharpCode.AvalonEdit.XmlParser public enum RawTextType { + /// Ends with non-whitespace WhiteSpace, + + /// Ends with "<"; "]]>" is error CharacterData, + + /// Ends with "-->"; "--" is error Comment, + + /// Ends with "]]>" CData, - DocumentTypeDefinition, + + /// Ends with "?>" + ProcessingInstruction, + + /// Ends with "<" or ">" + UnknownBang, + + /// Unknown Other } diff --git a/src/Libraries/AvalonEdit/ICSharpCode.AvalonEdit/XmlParser/XmlParser.cs b/src/Libraries/AvalonEdit/ICSharpCode.AvalonEdit/XmlParser/XmlParser.cs index 6f3e22cc14..468b041f5e 100644 --- a/src/Libraries/AvalonEdit/ICSharpCode.AvalonEdit/XmlParser/XmlParser.cs +++ b/src/Libraries/AvalonEdit/ICSharpCode.AvalonEdit/XmlParser/XmlParser.cs @@ -59,7 +59,7 @@ namespace ICSharpCode.AvalonEdit.XmlParser /// " | "/>") /// End tag: "" - /// P.instr.: "" + /// P.instr.: "" /// Comment: "" /// CData: "" /// DTD: "" (DOCTYPE or other DTD names) @@ -74,6 +74,17 @@ namespace ICSharpCode.AvalonEdit.XmlParser /// /// Note that there can always be multiple consequtive RawText nodes. /// This is to ensure that idividual texts are not too long. + /// + /// XML Spec: http://www.w3.org/TR/xml/ + /// XML EBNF: http://www.jelks.nu/XML/xmlebnf.html + /// + /// Internals: + /// + /// "Try" methods can silently fail by returning false. + /// MoveTo methods do not move if they are already at the given target + /// If methods return some object, it must be no-empty. It is up to the caller to ensure + /// the context is appropriate for reading. + /// /// public class XmlParser { @@ -163,6 +174,10 @@ namespace ICSharpCode.AvalonEdit.XmlParser void OnParsed(RawObject obj) { + if (obj.Length == 0 && !(obj is RawDocument)) { + throw new Exception(string.Format("Could not parse {0}. It has zero length.", obj)); + } + // TODO: Record touched memory parsedItems.Add(obj); System.Diagnostics.Debug.WriteLine("XML Parser: Parsed " + obj.ToString()); } @@ -222,15 +237,6 @@ namespace ICSharpCode.AvalonEdit.XmlParser } } - // The methods start with 'try' to make it clear they can silently fail. - // Read methods without 'try' have to succed or throw exception. - // - // For example: - // while(true) TryMoveNext(); is obviously infinite loop - // whereas - // while(true) MoveNext(); should eventulay throw exception (if MoveNext it existed) - // - bool TryMoveNext() { if (currentLocation == readingEnd) return false; @@ -330,7 +336,7 @@ namespace ICSharpCode.AvalonEdit.XmlParser } static char[] WhiteSpaceChars = new char[] {' ', '\n', '\r', '\t'}; - static char[] WhiteSpaceAndReservedChars = new char[] {' ', '\n', '\r', '\t', '<', '=', '>', '/', '?'}; + static char[] WhiteSpaceAndReservedChars = new char[] {' ', '\n', '\r', '\t', '=', '\'', '"', '<', '>', '/', '?'}; bool TryPeekWhiteSpace() { @@ -339,20 +345,41 @@ namespace ICSharpCode.AvalonEdit.XmlParser return WhiteSpaceChars.Contains(input[currentLocation]); } - string ReadName() + + /// + /// Read a name token. + /// The following characters are not allowed: + /// "" End of file + /// " \n\r\t" Whitesapce + /// "=\'\"" Attribute value + /// "<" Openning Tag + /// ">/?" Closing Tag + /// + bool TryReadName(out string res) { AssertHasMoreData(); int start = currentLocation; TryMoveToAnyOf(WhiteSpaceAndReservedChars.ToArray()); - return GetText(start, currentLocation); + if (start == currentLocation) { + res = null; + return false; + } else { + res = GetText(start, currentLocation); + // TODO: Check that it is valid XML name + return true; + } } + /// + /// Context: any + /// RawDocument ReadDocument() { RawDocument doc; if (TryReadFromCacheOrNew(out doc)) return doc; + // TODO: Errors in document structure doc.StartOffset = currentLocation; while(true) { if (IsEndOfFile()) { @@ -369,6 +396,9 @@ namespace ICSharpCode.AvalonEdit.XmlParser return doc; } + /// + /// Context: "<" + /// RawObject ReadElementOrTag() { AssertHasMoreData(); @@ -382,6 +412,9 @@ namespace ICSharpCode.AvalonEdit.XmlParser } } + /// + /// Context: "<" + /// RawElement ReadElement() { AssertHasMoreData(); @@ -393,7 +426,7 @@ namespace ICSharpCode.AvalonEdit.XmlParser // Read start tag element.AddChild(ReadTag()); Debug.Assert(element.StartTag.IsStartTag); - // Read content and end tag + // Read content and end tag (only if properly closed) if (element.StartTag.ClosingBracket == ">") { while(true) { if (IsEndOfFile()) { @@ -408,11 +441,18 @@ namespace ICSharpCode.AvalonEdit.XmlParser } } element.EndOffset = currentLocation; + // TODO: Closing tag matches + // TODO: Heuristic on closing + + // TODO: ERROR - attribute name may not apper multiple times OnParsed(element); return element; } + /// + /// Context: "<" + /// RawTag ReadTag() { AssertHasMoreData(); @@ -426,27 +466,25 @@ namespace ICSharpCode.AvalonEdit.XmlParser // It identifies the type of tag and parsing behavior for the rest of it tag.OpeningBracket = ReadOpeningBracket(); - if (tag.IsStartTag || tag.IsEndTag || tag.IsProcessingInstruction) { + if (tag.IsStartTag || tag.IsEndTag) { // Read the name - if (HasMoreData()) { - tag.Name = ReadName(); - } + string name; + if (TryReadName(out name)) tag.Name = name; + // TODO: Error - bad name + // TODO: Error - no name? + + // TODO: Error - = or " or ' not expected + // Read attributes for the tag - while(true) { - if (TryPeekWhiteSpace()) { - tag.AddChildren(ReadText(RawTextType.WhiteSpace)); - } - string bracket; - if (TryReadClosingBracket(out bracket)) { - tag.ClosingBracket = bracket; - break; - } + while(true) { + // Chech for all forbiden 'name' charcters first - see ReadName + if (IsEndOfFile()) break; + if (TryPeekWhiteSpace()) tag.AddChildren(ReadText(RawTextType.WhiteSpace)); if (TryPeek('<')) break; - if (HasMoreData()) { - tag.AddChild(ReadAttribulte()); - continue; - } - break; // End of file + if (TryPeek('>') || TryPeek('/') || TryPeek('?')) break; // End tag + + // We have "=\'\"" or name - read attribute + tag.AddChild(ReadAttribulte()); } } else if (tag.IsComment) { // TODO: Backtrack if file end reached @@ -454,22 +492,27 @@ namespace ICSharpCode.AvalonEdit.XmlParser } else if (tag.IsCData) { // TODO: Backtrack if file end reached tag.AddChildren(ReadText(RawTextType.CData)); + } else if (tag.IsProcessingInstruction) { + string name; + if (TryReadName(out name)) tag.Name = name; + // TODO: Error - bad name + // TODO: Error - no name? + // TODO: Backtrack if file end reached + tag.AddChildren(ReadText(RawTextType.ProcessingInstruction)); + } else if (tag.IsUnknownBang) { + // TODO: Backtack if '<' (or end of file) + tag.AddChildren(ReadText(RawTextType.UnknownBang)); } else if (tag.IsDocumentType) { tag.AddChildren(ReadContentOfDTD()); - } else if (tag.IsUnknownBang) { - if (HasMoreData()) { - int start = currentLocation; - TryMoveToAnyOf('<', '>'); - tag.AddChild(MakeText(start, currentLocation)); - } } else { throw new Exception(string.Format("Unknown opening bracket '{0}'", tag.OpeningBracket)); } - if (tag.ClosingBracket == null) { - string bracket; - if (TryReadClosingBracket(out bracket)) tag.ClosingBracket = bracket; - } + // Read closing bracket + string bracket; + if (TryReadClosingBracket(out bracket)) tag.ClosingBracket = bracket; + // TODO: else ERROR - Missing closing bracket + // TODO: check correct closing bracket (special case if end of file) tag.EndOffset = currentLocation; @@ -478,8 +521,8 @@ namespace ICSharpCode.AvalonEdit.XmlParser } /// - /// Reads any of the know opening brackets - /// Also accepts them if they are incomplete; one charater is suffcient + /// Reads any of the know opening brackets. (only full bracket) + /// Context: "<" /// string ReadOpeningBracket() { @@ -500,6 +543,7 @@ namespace ICSharpCode.AvalonEdit.XmlParser // the dtdName includes " - /// Reads any of the know closing brackets - /// Also accepts them if they are incomplete; one charater is suffcient + /// Reads any of the know closing brackets. (only full bracket) + /// Context: any /// bool TryReadClosingBracket(out string bracket) { + // TODO: Touched memory // We are using a lot of string literals so that the memory instances are shared int start = currentLocation; if (TryRead('>')) { bracket = ">"; - } else if (TryRead('/')) { - if (TryRead('>')) { - bracket = "/>"; - } else { - bracket = "/"; - } - } else if (TryRead('?')) { - if (TryRead('>')) { - bracket = "?>"; - } else { - bracket = "?"; - } - } else if (TryReadPartOf("-->")) { - bracket = GetText(start, currentLocation); - } else if (TryReadPartOf("]]>")) { - bracket = GetText(start, currentLocation); + } else if (TryRead("/>")) { + bracket = "/>"; + } else if (TryRead("?>")) { + bracket = "?>"; + } else if (TryRead("-->")) { + bracket = "-->"; + } else if (TryRead("]]>")) { + bracket = "]]>"; } else { bracket = null; return false; @@ -557,7 +594,9 @@ namespace ICSharpCode.AvalonEdit.XmlParser if (IsEndOfFile()) break; TryMoveToAnyOf('<', ']'); if (TryPeek('<')) { - yield return MakeText(start, currentLocation); + if (start != currentLocation) { // Two following tags + yield return MakeText(start, currentLocation); + } yield return ReadTag(); start = currentLocation; } @@ -574,6 +613,9 @@ namespace ICSharpCode.AvalonEdit.XmlParser } } + /// + /// Context: name or "=\'\"" + /// RawAttribute ReadAttribulte() { AssertHasMoreData(); @@ -582,26 +624,45 @@ namespace ICSharpCode.AvalonEdit.XmlParser if (TryReadFromCacheOrNew(out attr)) return attr; attr.StartOffset = currentLocation; - if (HasMoreData()) attr.Name = ReadName(); + + // Read name + string name; + if (TryReadName(out name)) attr.Name = name; + // TODO: else ERROR - attribute name expected + + // Read equals sign and surrounding whitespace int checkpoint = currentLocation; TryMoveToNonWhiteSpace(); if (TryRead('=')) { TryMoveToNonWhiteSpace(); - attr.EqualsSign += GetText(checkpoint, currentLocation); - // Read attribute value - int start = currentLocation; - if (TryRead('"')) { - TryMoveToAnyOf('"', '<'); - TryRead('"'); - attr.Value = GetText(start, currentLocation); - } else if (TryRead('\'')) { - TryMoveToAnyOf('\'', '<'); - TryRead('\''); - attr.Value = GetText(start, currentLocation); - } + attr.EqualsSign = GetText(checkpoint, currentLocation); } else { + // TODO: Track touched memory currentLocation = checkpoint; + // TODO: ERROR - Equals expected + } + + // Read attribute value + int start = currentLocation; + if (TryRead('"')) { + TryMoveToAnyOf('"', '<'); + TryRead('"'); + // TODO: Some backtracking? + // TODO: ERROR - Attribute value not closed + attr.Value = GetText(start, currentLocation); + } else if (TryRead('\'')) { + TryMoveToAnyOf('\'', '<'); + TryRead('\''); + // TODO: Some backtracking? + // TODO: ERROR - Attribute value not closed + attr.Value = GetText(start, currentLocation); + } else { + // TODO: ERROR - Attribute value expected } + + // TODO: Heuristic for missing " or ' + // TODO: Normalize attribute values + attr.EndOffset = currentLocation; OnParsed(attr); @@ -614,11 +675,14 @@ namespace ICSharpCode.AvalonEdit.XmlParser StartOffset = start, EndOffset = end, Value = GetText(start, end), + Type = RawTextType.Other }; + + OnParsed(text); return text; } - const int maxEntityLenght = 12; // 6 for build-in ones + const int maxEntityLenght = 12; // The longest build-in one is 10 ("􏿿") const int maxTextFragmentSize = 8; const int lookAheadLenght = (3 * maxTextFragmentSize) / 2; const int backtrackLenght = 4; // 2: get back over "]]" 1: so that we have some data 1: safety @@ -629,6 +693,8 @@ namespace ICSharpCode.AvalonEdit.XmlParser /// IEnumerable ReadText(RawTextType type) { + // TODO: Rewrite + bool lookahead = false; while(true) { RawText text; @@ -664,13 +730,17 @@ namespace ICSharpCode.AvalonEdit.XmlParser if (type == RawTextType.WhiteSpace) { TryMoveToNonWhiteSpace(); } else if (type == RawTextType.CharacterData) { + // TODO: "]]>" is error TryMoveTo('<'); } else if (type == RawTextType.Comment) { - TryMoveTo("--"); - } else if (type == RawTextType.DocumentTypeDefinition) { - TryMoveTo('>'); + // TODO: "--" is error + TryMoveTo("-->"); } else if (type == RawTextType.CData) { TryMoveTo("]]>"); + } else if (type == RawTextType.ProcessingInstruction) { + TryMoveTo("?>"); + } else if (type == RawTextType.UnknownBang) { + TryMoveToAnyOf('<', '>'); } else { throw new Exception("Uknown type " + type); } @@ -689,7 +759,8 @@ namespace ICSharpCode.AvalonEdit.XmlParser // If there is entity reference, make sure the next segment starts with it to prevent framentation int entitySearchStart = Math.Max(start + 1 /* data for us */, backtrack - maxEntityLenght); - int entityIndex = input.LastIndexOf('&', entitySearchStart, backtrack - entitySearchStart); + // Note that LastIndexOf works backward + int entityIndex = input.LastIndexOf('&', backtrack, backtrack - entitySearchStart); if (entityIndex != -1) { backtrack = entityIndex; }