Browse Source

XML Parser: Rechecked all parse functions and tried to make them more error tolerant; Added a lot of TODOs for the missing functionality.

git-svn-id: svn://svn.sharpdevelop.net/sharpdevelop/trunk@4604 1ccf3a8d-04fe-1044-b7c0-cef0b8235c61
shortcuts
David Srbecký 16 years ago
parent
commit
b39e888905
  1. 34
      src/Libraries/AvalonEdit/ICSharpCode.AvalonEdit/XmlParser/RawObjects.cs
  2. 233
      src/Libraries/AvalonEdit/ICSharpCode.AvalonEdit/XmlParser/XmlParser.cs

34
src/Libraries/AvalonEdit/ICSharpCode.AvalonEdit/XmlParser/RawObjects.cs

@ -8,14 +8,13 @@
using System; using System;
using System.Collections.Generic; using System.Collections.Generic;
using System.Collections.ObjectModel; using System.Collections.ObjectModel;
using System.Collections.Specialized;
using System.Linq; using System.Linq;
using System.Xml; using System.Xml;
using System.Xml.Linq; using System.Xml.Linq;
using ICSharpCode.AvalonEdit.Document; using ICSharpCode.AvalonEdit.Document;
// Missing XML comment // TODO: Missing XML comment
#pragma warning disable 1591 #pragma warning disable 1591
namespace ICSharpCode.AvalonEdit.XmlParser namespace ICSharpCode.AvalonEdit.XmlParser
@ -106,13 +105,15 @@ namespace ICSharpCode.AvalonEdit.XmlParser
protected XName EncodeXName(string name) protected XName EncodeXName(string name)
{ {
if (string.IsNullOrEmpty(name)) name = "_";
string namesapce = string.Empty; string namesapce = string.Empty;
int colonIndex = name.IndexOf(':'); int colonIndex = name.IndexOf(':');
if (colonIndex != -1) { if (colonIndex != -1) {
namesapce = name.Substring(0, colonIndex); namesapce = name.Substring(0, colonIndex);
name = name.Substring(colonIndex + 1); name = name.Substring(colonIndex + 1);
} }
if (string.IsNullOrEmpty(name)) name = "_";
name = XmlConvert.EncodeLocalName(name); name = XmlConvert.EncodeLocalName(name);
namesapce = XmlConvert.EncodeLocalName(namesapce); namesapce = XmlConvert.EncodeLocalName(namesapce);
return XName.Get(name, namesapce); return XName.Get(name, namesapce);
@ -452,9 +453,12 @@ namespace ICSharpCode.AvalonEdit.XmlParser
{ {
if (!firstUpdate) LogLinq("Updating XElement Attributes of '{0}'", this.StartTag.Name); if (!firstUpdate) LogLinq("Updating XElement Attributes of '{0}'", this.StartTag.Name);
xElem.ReplaceAttributes(); // Otherwise we get duplicate item exception // TODO: Investigate null
XAttribute[] attrs = this.StartTag.Children.OfType<RawAttribute>().Select(x => x.GetXAttribute()).Distinct(new AttributeNameComparer()).ToArray(); if (xElem != null) {
xElem.ReplaceAttributes(attrs); xElem.ReplaceAttributes(); // Otherwise we get duplicate item exception
XAttribute[] attrs = this.StartTag.Children.OfType<RawAttribute>().Select(x => x.GetXAttribute()).Distinct(new AttributeNameComparer()).ToArray();
xElem.ReplaceAttributes(attrs);
}
} }
void UpdateXElementChildren(bool firstUpdate) void UpdateXElementChildren(bool firstUpdate)
@ -494,6 +498,8 @@ namespace ICSharpCode.AvalonEdit.XmlParser
public string EqualsSign { get; set; } public string EqualsSign { get; set; }
public string Value { get; set; } public string Value { get; set; }
// TODO: Provide method to dereference Value - &
public override void UpdateDataFrom(RawObject source) public override void UpdateDataFrom(RawObject source)
{ {
if (this.ReadCallID == source.ReadCallID) return; if (this.ReadCallID == source.ReadCallID) return;
@ -548,11 +554,25 @@ namespace ICSharpCode.AvalonEdit.XmlParser
public enum RawTextType public enum RawTextType
{ {
/// <summary> Ends with non-whitespace </summary>
WhiteSpace, WhiteSpace,
/// <summary> Ends with "&lt;"; "]]&gt;" is error </summary>
CharacterData, CharacterData,
/// <summary> Ends with "-->"; "--" is error </summary>
Comment, Comment,
/// <summary> Ends with "]]&gt;" </summary>
CData, CData,
DocumentTypeDefinition,
/// <summary> Ends with "?>" </summary>
ProcessingInstruction,
/// <summary> Ends with "&lt;" or ">" </summary>
UnknownBang,
/// <summary> Unknown </summary>
Other Other
} }

233
src/Libraries/AvalonEdit/ICSharpCode.AvalonEdit/XmlParser/XmlParser.cs

@ -59,7 +59,7 @@ namespace ICSharpCode.AvalonEdit.XmlParser
/// <![CDATA[ /// <![CDATA[
/// Start tag: "<" Name? (RawText+ RawAttribute)* RawText* (">" | "/>") /// Start tag: "<" Name? (RawText+ RawAttribute)* RawText* (">" | "/>")
/// End tag: "</" Name? (RawText+ RawAttribute)* RawText* ">" /// End tag: "</" Name? (RawText+ RawAttribute)* RawText* ">"
/// P.instr.: "<?" Name? (RawText+ RawAttribute)* RawText* "?>" /// P.instr.: "<?" Name? (RawText)* "?>"
/// Comment: "<!--" (RawText)* "-->" /// Comment: "<!--" (RawText)* "-->"
/// CData: "<![CDATA[" (RawText)* "]]" ">" /// CData: "<![CDATA[" (RawText)* "]]" ">"
/// DTD: "<!DOCTYPE" (RawText+ RawTag)* RawText* ">" (DOCTYPE or other DTD names) /// DTD: "<!DOCTYPE" (RawText+ RawTag)* RawText* ">" (DOCTYPE or other DTD names)
@ -74,6 +74,17 @@ namespace ICSharpCode.AvalonEdit.XmlParser
/// ///
/// Note that there can always be multiple consequtive RawText nodes. /// Note that there can always be multiple consequtive RawText nodes.
/// This is to ensure that idividual texts are not too long. /// This is to ensure that idividual texts are not too long.
///
/// XML Spec: http://www.w3.org/TR/xml/
/// XML EBNF: http://www.jelks.nu/XML/xmlebnf.html
///
/// Internals:
///
/// "Try" methods can silently fail by returning false.
/// MoveTo methods do not move if they are already at the given target
/// If methods return some object, it must be non-empty. It is up to the caller to ensure
/// the context is appropriate for reading.
///
/// </remarks> /// </remarks>
public class XmlParser public class XmlParser
{ {
@ -163,6 +174,10 @@ namespace ICSharpCode.AvalonEdit.XmlParser
void OnParsed(RawObject obj) void OnParsed(RawObject obj)
{ {
if (obj.Length == 0 && !(obj is RawDocument)) {
throw new Exception(string.Format("Could not parse {0}. It has zero length.", obj));
}
// TODO: Record touched memory
parsedItems.Add(obj); parsedItems.Add(obj);
System.Diagnostics.Debug.WriteLine("XML Parser: Parsed " + obj.ToString()); System.Diagnostics.Debug.WriteLine("XML Parser: Parsed " + obj.ToString());
} }
@ -222,15 +237,6 @@ namespace ICSharpCode.AvalonEdit.XmlParser
} }
} }
// The methods start with 'try' to make it clear they can silently fail.
// Read methods without 'try' have to succed or throw exception.
//
// For example:
// while(true) TryMoveNext(); is obviously infinite loop
// whereas
// while(true) MoveNext(); should eventulay throw exception (if MoveNext it existed)
//
bool TryMoveNext() bool TryMoveNext()
{ {
if (currentLocation == readingEnd) return false; if (currentLocation == readingEnd) return false;
@ -330,7 +336,7 @@ namespace ICSharpCode.AvalonEdit.XmlParser
} }
static char[] WhiteSpaceChars = new char[] {' ', '\n', '\r', '\t'}; static char[] WhiteSpaceChars = new char[] {' ', '\n', '\r', '\t'};
static char[] WhiteSpaceAndReservedChars = new char[] {' ', '\n', '\r', '\t', '<', '=', '>', '/', '?'}; static char[] WhiteSpaceAndReservedChars = new char[] {' ', '\n', '\r', '\t', '=', '\'', '"', '<', '>', '/', '?'};
bool TryPeekWhiteSpace() bool TryPeekWhiteSpace()
{ {
@ -339,20 +345,41 @@ namespace ICSharpCode.AvalonEdit.XmlParser
return WhiteSpaceChars.Contains(input[currentLocation]); return WhiteSpaceChars.Contains(input[currentLocation]);
} }
string ReadName()
/// <summary>
/// Read a name token.
/// The following characters are not allowed:
/// "" End of file
/// " \n\r\t" Whitespace
/// "=\'\"" Attribute value
/// "&lt;" Opening Tag
/// ">/?" Closing Tag
/// </summary>
bool TryReadName(out string res)
{ {
AssertHasMoreData(); AssertHasMoreData();
int start = currentLocation; int start = currentLocation;
TryMoveToAnyOf(WhiteSpaceAndReservedChars.ToArray()); TryMoveToAnyOf(WhiteSpaceAndReservedChars.ToArray());
return GetText(start, currentLocation); if (start == currentLocation) {
res = null;
return false;
} else {
res = GetText(start, currentLocation);
// TODO: Check that it is valid XML name
return true;
}
} }
/// <summary>
/// Context: any
/// </summary>
RawDocument ReadDocument() RawDocument ReadDocument()
{ {
RawDocument doc; RawDocument doc;
if (TryReadFromCacheOrNew(out doc)) return doc; if (TryReadFromCacheOrNew(out doc)) return doc;
// TODO: Errors in document structure
doc.StartOffset = currentLocation; doc.StartOffset = currentLocation;
while(true) { while(true) {
if (IsEndOfFile()) { if (IsEndOfFile()) {
@ -369,6 +396,9 @@ namespace ICSharpCode.AvalonEdit.XmlParser
return doc; return doc;
} }
/// <summary>
/// Context: "&lt;"
/// </summary>
RawObject ReadElementOrTag() RawObject ReadElementOrTag()
{ {
AssertHasMoreData(); AssertHasMoreData();
@ -382,6 +412,9 @@ namespace ICSharpCode.AvalonEdit.XmlParser
} }
} }
/// <summary>
/// Context: "&lt;"
/// </summary>
RawElement ReadElement() RawElement ReadElement()
{ {
AssertHasMoreData(); AssertHasMoreData();
@ -393,7 +426,7 @@ namespace ICSharpCode.AvalonEdit.XmlParser
// Read start tag // Read start tag
element.AddChild(ReadTag()); element.AddChild(ReadTag());
Debug.Assert(element.StartTag.IsStartTag); Debug.Assert(element.StartTag.IsStartTag);
// Read content and end tag // Read content and end tag (only if properly closed)
if (element.StartTag.ClosingBracket == ">") { if (element.StartTag.ClosingBracket == ">") {
while(true) { while(true) {
if (IsEndOfFile()) { if (IsEndOfFile()) {
@ -408,11 +441,18 @@ namespace ICSharpCode.AvalonEdit.XmlParser
} }
} }
element.EndOffset = currentLocation; element.EndOffset = currentLocation;
// TODO: Closing tag matches
// TODO: Heuristic on closing
// TODO: ERROR - attribute name may not apper multiple times
OnParsed(element); OnParsed(element);
return element; return element;
} }
/// <summary>
/// Context: "&lt;"
/// </summary>
RawTag ReadTag() RawTag ReadTag()
{ {
AssertHasMoreData(); AssertHasMoreData();
@ -426,27 +466,25 @@ namespace ICSharpCode.AvalonEdit.XmlParser
// It identifies the type of tag and parsing behavior for the rest of it // It identifies the type of tag and parsing behavior for the rest of it
tag.OpeningBracket = ReadOpeningBracket(); tag.OpeningBracket = ReadOpeningBracket();
if (tag.IsStartTag || tag.IsEndTag || tag.IsProcessingInstruction) { if (tag.IsStartTag || tag.IsEndTag) {
// Read the name // Read the name
if (HasMoreData()) { string name;
tag.Name = ReadName(); if (TryReadName(out name)) tag.Name = name;
} // TODO: Error - bad name
// TODO: Error - no name?
// TODO: Error - = or " or ' not expected
// Read attributes for the tag // Read attributes for the tag
while(true) { while(true) {
if (TryPeekWhiteSpace()) { // Chech for all forbiden 'name' charcters first - see ReadName
tag.AddChildren(ReadText(RawTextType.WhiteSpace)); if (IsEndOfFile()) break;
} if (TryPeekWhiteSpace()) tag.AddChildren(ReadText(RawTextType.WhiteSpace));
string bracket;
if (TryReadClosingBracket(out bracket)) {
tag.ClosingBracket = bracket;
break;
}
if (TryPeek('<')) break; if (TryPeek('<')) break;
if (HasMoreData()) { if (TryPeek('>') || TryPeek('/') || TryPeek('?')) break; // End tag
tag.AddChild(ReadAttribulte());
continue; // We have "=\'\"" or name - read attribute
} tag.AddChild(ReadAttribulte());
break; // End of file
} }
} else if (tag.IsComment) { } else if (tag.IsComment) {
// TODO: Backtrack if file end reached // TODO: Backtrack if file end reached
@ -454,22 +492,27 @@ namespace ICSharpCode.AvalonEdit.XmlParser
} else if (tag.IsCData) { } else if (tag.IsCData) {
// TODO: Backtrack if file end reached // TODO: Backtrack if file end reached
tag.AddChildren(ReadText(RawTextType.CData)); tag.AddChildren(ReadText(RawTextType.CData));
} else if (tag.IsProcessingInstruction) {
string name;
if (TryReadName(out name)) tag.Name = name;
// TODO: Error - bad name
// TODO: Error - no name?
// TODO: Backtrack if file end reached
tag.AddChildren(ReadText(RawTextType.ProcessingInstruction));
} else if (tag.IsUnknownBang) {
// TODO: Backtack if '<' (or end of file)
tag.AddChildren(ReadText(RawTextType.UnknownBang));
} else if (tag.IsDocumentType) { } else if (tag.IsDocumentType) {
tag.AddChildren(ReadContentOfDTD()); tag.AddChildren(ReadContentOfDTD());
} else if (tag.IsUnknownBang) {
if (HasMoreData()) {
int start = currentLocation;
TryMoveToAnyOf('<', '>');
tag.AddChild(MakeText(start, currentLocation));
}
} else { } else {
throw new Exception(string.Format("Unknown opening bracket '{0}'", tag.OpeningBracket)); throw new Exception(string.Format("Unknown opening bracket '{0}'", tag.OpeningBracket));
} }
if (tag.ClosingBracket == null) { // Read closing bracket
string bracket; string bracket;
if (TryReadClosingBracket(out bracket)) tag.ClosingBracket = bracket; if (TryReadClosingBracket(out bracket)) tag.ClosingBracket = bracket;
} // TODO: else ERROR - Missing closing bracket
// TODO: check correct closing bracket (special case if end of file)
tag.EndOffset = currentLocation; tag.EndOffset = currentLocation;
@ -478,8 +521,8 @@ namespace ICSharpCode.AvalonEdit.XmlParser
} }
/// <summary> /// <summary>
/// Reads any of the know opening brackets /// Reads any of the know opening brackets. (only full bracket)
/// Also accepts them if they are incomplete; one charater is suffcient /// Context: "&lt;"
/// </summary> /// </summary>
string ReadOpeningBracket() string ReadOpeningBracket()
{ {
@ -500,6 +543,7 @@ namespace ICSharpCode.AvalonEdit.XmlParser
// the dtdName includes "<!" // the dtdName includes "<!"
if (TryRead(dtdName.Remove(0, 2))) return dtdName; if (TryRead(dtdName.Remove(0, 2))) return dtdName;
} }
// TODO: Error - unkown bang tag
return "<!"; return "<!";
} }
} else { } else {
@ -511,31 +555,24 @@ namespace ICSharpCode.AvalonEdit.XmlParser
} }
/// <summary> /// <summary>
/// Reads any of the know closing brackets /// Reads any of the know closing brackets. (only full bracket)
/// Also accepts them if they are incomplete; one charater is suffcient /// Context: any
/// </summary> /// </summary>
bool TryReadClosingBracket(out string bracket) bool TryReadClosingBracket(out string bracket)
{ {
// TODO: Touched memory
// We are using a lot of string literals so that the memory instances are shared // We are using a lot of string literals so that the memory instances are shared
int start = currentLocation; int start = currentLocation;
if (TryRead('>')) { if (TryRead('>')) {
bracket = ">"; bracket = ">";
} else if (TryRead('/')) { } else if (TryRead("/>")) {
if (TryRead('>')) { bracket = "/>";
bracket = "/>"; } else if (TryRead("?>")) {
} else { bracket = "?>";
bracket = "/"; } else if (TryRead("-->")) {
} bracket = "-->";
} else if (TryRead('?')) { } else if (TryRead("]]>")) {
if (TryRead('>')) { bracket = "]]>";
bracket = "?>";
} else {
bracket = "?";
}
} else if (TryReadPartOf("-->")) {
bracket = GetText(start, currentLocation);
} else if (TryReadPartOf("]]>")) {
bracket = GetText(start, currentLocation);
} else { } else {
bracket = null; bracket = null;
return false; return false;
@ -557,7 +594,9 @@ namespace ICSharpCode.AvalonEdit.XmlParser
if (IsEndOfFile()) break; if (IsEndOfFile()) break;
TryMoveToAnyOf('<', ']'); TryMoveToAnyOf('<', ']');
if (TryPeek('<')) { if (TryPeek('<')) {
yield return MakeText(start, currentLocation); if (start != currentLocation) { // Two following tags
yield return MakeText(start, currentLocation);
}
yield return ReadTag(); yield return ReadTag();
start = currentLocation; start = currentLocation;
} }
@ -574,6 +613,9 @@ namespace ICSharpCode.AvalonEdit.XmlParser
} }
} }
/// <summary>
/// Context: name or "=\'\""
/// </summary>
RawAttribute ReadAttribulte() RawAttribute ReadAttribulte()
{ {
AssertHasMoreData(); AssertHasMoreData();
@ -582,26 +624,45 @@ namespace ICSharpCode.AvalonEdit.XmlParser
if (TryReadFromCacheOrNew(out attr)) return attr; if (TryReadFromCacheOrNew(out attr)) return attr;
attr.StartOffset = currentLocation; attr.StartOffset = currentLocation;
if (HasMoreData()) attr.Name = ReadName();
// Read name
string name;
if (TryReadName(out name)) attr.Name = name;
// TODO: else ERROR - attribute name expected
// Read equals sign and surrounding whitespace
int checkpoint = currentLocation; int checkpoint = currentLocation;
TryMoveToNonWhiteSpace(); TryMoveToNonWhiteSpace();
if (TryRead('=')) { if (TryRead('=')) {
TryMoveToNonWhiteSpace(); TryMoveToNonWhiteSpace();
attr.EqualsSign += GetText(checkpoint, currentLocation); attr.EqualsSign = GetText(checkpoint, currentLocation);
// Read attribute value
int start = currentLocation;
if (TryRead('"')) {
TryMoveToAnyOf('"', '<');
TryRead('"');
attr.Value = GetText(start, currentLocation);
} else if (TryRead('\'')) {
TryMoveToAnyOf('\'', '<');
TryRead('\'');
attr.Value = GetText(start, currentLocation);
}
} else { } else {
// TODO: Track touched memory
currentLocation = checkpoint; currentLocation = checkpoint;
// TODO: ERROR - Equals expected
}
// Read attribute value
int start = currentLocation;
if (TryRead('"')) {
TryMoveToAnyOf('"', '<');
TryRead('"');
// TODO: Some backtracking?
// TODO: ERROR - Attribute value not closed
attr.Value = GetText(start, currentLocation);
} else if (TryRead('\'')) {
TryMoveToAnyOf('\'', '<');
TryRead('\'');
// TODO: Some backtracking?
// TODO: ERROR - Attribute value not closed
attr.Value = GetText(start, currentLocation);
} else {
// TODO: ERROR - Attribute value expected
} }
// TODO: Heuristic for missing " or '
// TODO: Normalize attribute values
attr.EndOffset = currentLocation; attr.EndOffset = currentLocation;
OnParsed(attr); OnParsed(attr);
@ -614,11 +675,14 @@ namespace ICSharpCode.AvalonEdit.XmlParser
StartOffset = start, StartOffset = start,
EndOffset = end, EndOffset = end,
Value = GetText(start, end), Value = GetText(start, end),
Type = RawTextType.Other
}; };
OnParsed(text);
return text; return text;
} }
const int maxEntityLenght = 12; // 6 for build-in ones const int maxEntityLenght = 12; // The longest build-in one is 10 ("&#x10FFFF;")
const int maxTextFragmentSize = 8; const int maxTextFragmentSize = 8;
const int lookAheadLenght = (3 * maxTextFragmentSize) / 2; const int lookAheadLenght = (3 * maxTextFragmentSize) / 2;
const int backtrackLenght = 4; // 2: get back over "]]" 1: so that we have some data 1: safety const int backtrackLenght = 4; // 2: get back over "]]" 1: so that we have some data 1: safety
@ -629,6 +693,8 @@ namespace ICSharpCode.AvalonEdit.XmlParser
/// </summary> /// </summary>
IEnumerable<RawObject> ReadText(RawTextType type) IEnumerable<RawObject> ReadText(RawTextType type)
{ {
// TODO: Rewrite
bool lookahead = false; bool lookahead = false;
while(true) { while(true) {
RawText text; RawText text;
@ -664,13 +730,17 @@ namespace ICSharpCode.AvalonEdit.XmlParser
if (type == RawTextType.WhiteSpace) { if (type == RawTextType.WhiteSpace) {
TryMoveToNonWhiteSpace(); TryMoveToNonWhiteSpace();
} else if (type == RawTextType.CharacterData) { } else if (type == RawTextType.CharacterData) {
// TODO: "]]>" is error
TryMoveTo('<'); TryMoveTo('<');
} else if (type == RawTextType.Comment) { } else if (type == RawTextType.Comment) {
TryMoveTo("--"); // TODO: "--" is error
} else if (type == RawTextType.DocumentTypeDefinition) { TryMoveTo("-->");
TryMoveTo('>');
} else if (type == RawTextType.CData) { } else if (type == RawTextType.CData) {
TryMoveTo("]]>"); TryMoveTo("]]>");
} else if (type == RawTextType.ProcessingInstruction) {
TryMoveTo("?>");
} else if (type == RawTextType.UnknownBang) {
TryMoveToAnyOf('<', '>');
} else { } else {
throw new Exception("Uknown type " + type); throw new Exception("Uknown type " + type);
} }
@ -689,7 +759,8 @@ namespace ICSharpCode.AvalonEdit.XmlParser
// If there is entity reference, make sure the next segment starts with it to prevent framentation // If there is entity reference, make sure the next segment starts with it to prevent framentation
int entitySearchStart = Math.Max(start + 1 /* data for us */, backtrack - maxEntityLenght); int entitySearchStart = Math.Max(start + 1 /* data for us */, backtrack - maxEntityLenght);
int entityIndex = input.LastIndexOf('&', entitySearchStart, backtrack - entitySearchStart); // Note that LastIndexOf works backward
int entityIndex = input.LastIndexOf('&', backtrack, backtrack - entitySearchStart);
if (entityIndex != -1) { if (entityIndex != -1) {
backtrack = entityIndex; backtrack = entityIndex;
} }

Loading…
Cancel
Save