You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
630 lines
17 KiB
630 lines
17 KiB
// <file> |
|
// <copyright see="prj:///doc/copyright.txt"/> |
|
// <license see="prj:///doc/license.txt"/> |
|
// <owner name="David Srbecký" email="dsrbecky@gmail.com"/> |
|
// <version>$Revision$</version> |
|
// </file> |
|
|
|
using System; |
|
using System.Collections.Generic; |
|
using System.Diagnostics; |
|
using System.Linq; |
|
using System.Xml.Linq; |
|
|
|
using ICSharpCode.AvalonEdit.Document; |
|
|
|
namespace ICSharpCode.AvalonEdit.XmlParser |
|
{ |
|
/// <summary> |
|
/// Creates object tree from XML document. |
|
/// </summary> |
|
/// <remarks> |
|
/// The created tree fully describes the document and thus the original XML file can be
|
/// exactly reproduced. |
|
/// |
|
/// Any further parses will reparse only the changed parts and the existing tree will
|
/// be updated with the changes. The user can add event handlers to be notified of |
|
/// the changes. The parser tries to minimize the number of changes to the tree. |
|
/// (for example, it will add a single child at the start of collection rather than |
|
/// clearing the collection and adding new children) |
|
/// |
|
/// The object tree consists of following types: |
|
/// RawObject - Abstract base class for all types
/// RawContainer - Abstract base class for all types that can contain child nodes
|
/// RawDocument - The root object of the XML document |
|
/// RawElement - Logical grouping of other nodes together. The first child is always the start tag. |
|
/// RawTag - Represents any markup starting with "<" and (hopefully) ending with ">" |
|
/// RawAttribute - Name-value pair in a tag |
|
/// RawText - Whitespace or character data |
|
/// |
|
/// For example, see the following XML and the produced object tree: |
|
/// <![CDATA[ |
|
/// <!-- My favourite quote --> |
|
/// <quote author="Albert Einstein"> |
|
/// Make everything as simple as possible, but not simpler. |
|
/// </quote> |
|
/// |
|
/// RawDocument |
|
/// RawTag "<!--" "-->" |
|
/// RawText " My favourite quote " |
|
/// RawElement |
|
/// RawTag "<" "quote" ">" |
|
/// RawText " " |
|
/// RawAttribute 'author="Albert Einstein"' |
|
/// RawText "\n Make everything as simple as possible, but not simpler.\n" |
|
/// RawTag "</" "quote" ">" |
|
/// ]]> |
|
/// |
|
/// The precise content of RawTag depends on what it represents: |
|
/// <![CDATA[ |
|
/// Start tag: "<" Name? (RawText+ RawAttribute)* RawText* (">" | "/>") |
|
/// End tag: "</" Name? (RawText+ RawAttribute)* RawText* ">" |
|
/// P.instr.: "<?" Name? (RawText+ RawAttribute)* RawText* "?>" |
|
/// Comment: "<!" partof("--")? (RawText)* "-->" (Name is always null) |
|
/// DTD: "<!" partof("DOCTYPE") (RawText)* ">" (Name is always null) |
|
/// CData: "<!" partof("[CDATA[") (RawText)* "]]" ">" (Name is always null) |
|
/// ]]> |
|
/// |
|
/// The type of tag can be identified by the opening bracket.
/// There are helper properties in the RawTag class to identify the type; exactly
/// one of the properties will be true.
///
/// The closing bracket may be missing or may be different for malformed XML.
///
/// Note that there can always be multiple consecutive RawText nodes.
/// This is to ensure that individual texts are not too long.
|
/// </remarks> |
|
public class XmlParser |
|
{ |
|
/// <summary> Tree returned by Parse(); it is updated in place from each freshly parsed document </summary>
private RawDocument userDocument = new RawDocument();
/// <summary> Obtained from userDocument.GetXDocument() in the constructor </summary>
private XDocument userLinqDocument;
/// <summary> Source of the text to parse and of the change notifications </summary>
private TextDocument textDocument;
/// <summary> Previously parsed objects, looked up by offset so that they can be reused </summary>
private TextSegmentCollection<RawObject> parsedItems = new TextSegmentCollection<RawObject>();
/// <summary> Document changes recorded since the last call of Parse() </summary>
private List<DocumentChangeEventArgs> changesSinceLastParse = new List<DocumentChangeEventArgs>();
|
|
|
/// <summary>
/// Create new parser, but do not parse the text yet.
/// </summary>
/// <param name="textDocument">Document to parse; every change made to it is recorded for the next call of Parse()</param>
/// <exception cref="ArgumentNullException"><paramref name="textDocument"/> is null</exception>
public XmlParser(TextDocument textDocument)
{
	// Fail with a descriptive exception instead of a NullReferenceException
	// raised by the event subscription below
	if (textDocument == null)
		throw new ArgumentNullException("textDocument");

	this.userLinqDocument = userDocument.GetXDocument();
	this.textDocument = textDocument;
	// Remember each edit so that Parse() can discard the cached items it affected
	this.textDocument.Changed += delegate(object sender, DocumentChangeEventArgs e) {
		changesSinceLastParse.Add(e);
	};
}
|
|
|
/// <summary>
/// Incrementally parse the document
/// </summary>
/// <returns> The user document, updated with the data of the freshly parsed document </returns>
public RawDocument Parse()
{
	// Always read from the start of the current text; unchanged parts come from the cache
	input = textDocument.Text;
	currentLocation = 0;

	foreach (DocumentChangeEventArgs change in changesSinceLastParse) {
		// Update offsets of all items
		parsedItems.UpdateOffsets(change);

		// Remove any items affected by the change: the edited range widened by
		// two characters on each side and clamped to valid document offsets
		int lastOffset = textDocument.TextLength - 1;
		int start = Math.Max(0, Math.Min(change.Offset - 2, lastOffset));
		int end = Math.Max(0, Math.Min(change.Offset + change.InsertionLength + 2, lastOffset));
		foreach (RawObject staleItem in parsedItems.FindOverlappingSegments(start, end - start)) {
			parsedItems.Remove(staleItem);
			Log("Removed cached item {0}", staleItem);
		}
	}
	changesSinceLastParse.Clear();

	RawDocument freshDocument = ReadDocument();
	// Just in case parse method was called redundantly
	if (freshDocument.ReadCallID != userDocument.ReadCallID) {
		PrintStringCacheStats();
		RawObject.LogDom("Updating main DOM tree...");
	}
	userDocument.UpdateDataFrom(freshDocument);
	return userDocument;
}
|
|
|
/// <summary>
/// Looks for a cached object of type T that starts exactly at the given location.
/// On a hit, the current location is advanced by the length of that object.
/// </summary>
/// <returns> The cached object, or null if there is none </returns>
T ReadFromCache<T>(int location) where T: RawObject
{
	// Several cached objects may start at the same offset; check each of them
	for (RawObject candidate = parsedItems.FindFirstSegmentWithStartAfter(location);
	     candidate != null && candidate.StartOffset == location;
	     candidate = parsedItems.GetNextSegment(candidate))
	{
		T match = candidate as T;
		if (match == null) continue;

		currentLocation += candidate.Length;
		return match;
	}
	return null;
}
|
|
|
/// <summary> Writes a formatted message, prefixed with "XML Parser: ", to the debug output </summary>
void Log(string text, params object[] pars)
{
	Debug.WriteLine(string.Concat("XML Parser: ", text), pars);
}
|
|
|
/// <summary> Writes the textual form of a parsed object to the debug output </summary>
void LogParsed(RawObject obj)
{
	Debug.WriteLine(string.Concat("XML Parser: Parsed ", obj.ToString()));
}
|
|
|
/// <summary> Maps a string value to the single instance that is handed out for it </summary>
Dictionary<string, string> stringCache = new Dictionary<string, string>();
// Statistics reported by PrintStringCacheStats(); sizes are in the estimated bytes computed in GetCachedString
int stringCacheRequestedCount;
int stringCacheRequestedSize;
int stringCacheSavedCount;
int stringCacheSavedSize;

/// <summary>
/// Returns the shared instance of the given string value, so that equal strings
/// read from the document end up as one object. Updates the cache statistics.
/// </summary>
/// <param name="cached"> The string to look up; it becomes the shared instance if the value is new </param>
/// <returns> The previously stored instance with the same value, or the argument itself </returns>
string GetCachedString(string cached)
{
	// Size estimate used for the statistics: 8 bytes plus 2 bytes per character
	int estimatedSize = 8 + 2 * cached.Length;
	stringCacheRequestedCount += 1;
	stringCacheRequestedSize += estimatedSize;

	// Single lookup instead of ContainsKey followed by the indexer
	string shared;
	if (stringCache.TryGetValue(cached, out shared)) {
		// Get the instance from the cache instead
		stringCacheSavedCount += 1;
		stringCacheSavedSize += estimatedSize;
		return shared;
	}

	// Add to cache
	stringCache.Add(cached, cached);
	return cached;
}
|
|
|
/// <summary> Logs how many string requests (and estimated bytes) were served from the string cache </summary>
void PrintStringCacheStats()
{
	// The counters are never reset, so they grow with every parse. Multiply in
	// 64-bit arithmetic: savedSize * 100 overflows Int32 above roughly 21 MB.
	long savedPercent = stringCacheRequestedSize == 0 ? 0 : (long)stringCacheSavedSize * 100 / stringCacheRequestedSize;
	Log("String cache: Requested {0} ({1} bytes); Saved {2} ({3} bytes); {4}% Saved", stringCacheRequestedCount, stringCacheRequestedSize, stringCacheSavedCount, stringCacheSavedSize, savedPercent);
}
|
|
|
/// <summary> Text being parsed; assigned from the document at the start of Parse() </summary>
private string input;
/// <summary> Offset in input of the next character to be read </summary>
private int currentLocation;
|
|
|
/// <summary> True if the current location is exactly at the end of the input </summary>
bool IsEndOfFile()
{
	return input.Length == currentLocation;
}
|
|
|
/// <summary> True if at least one more character can be read </summary>
bool HasMoreData()
{
	return input.Length > currentLocation;
}
|
|
|
/// <summary> Throws an exception if the current location is at the end of the input </summary>
void AssertHasMoreData()
{
	if (IsEndOfFile())
		throw new Exception("Unexpected end of files");
}
|
|
|
// The methods start with 'Try' to make it clear they can silently fail.
// Read methods without 'Try' have to succeed or throw an exception.
//
// For example:
//   while(true) TryMoveNext();  is obviously an infinite loop
// whereas
//   while(true) MoveNext();     should eventually throw an exception (if MoveNext existed)
//
|
|
|
/// <summary> Moves one character forward; returns false (without moving) at the end of the input </summary>
bool TryMoveNext()
{
	bool canAdvance = currentLocation != input.Length;
	if (canAdvance) currentLocation++;
	return canAdvance;
}
|
|
|
/// <summary> Consumes the next character if it equals c; returns whether it was consumed </summary>
bool TryRead(char c)
{
	if (currentLocation == input.Length || input[currentLocation] != c)
		return false;

	currentLocation++;
	return true;
}
|
|
|
/// <summary> Consumes the given text if the input continues with it; returns whether it was consumed </summary>
bool TryRead(string text)
{
	if (!TryPeek(text))
		return false;

	currentLocation += text.Length;
	return true;
}
|
|
|
/// <summary>
/// Consumes the longest prefix of the given text that is present in the input.
/// Returns true if at least one character was read
/// </summary>
bool TryReadPartOf(string text)
{
	if (!TryPeek(text[0]))
		return false;

	// Keep reading until character differs or we have end of file
	int index = 0;
	while (index < text.Length && TryRead(text[index]))
		index++;
	return true;
}
|
|
|
/// <summary> True if the next character equals c; never moves the current location </summary>
bool TryPeek(char c)
{
	return currentLocation != input.Length && input[currentLocation] == c;
}
|
|
|
/// <summary> True if the input continues with the given text; never moves the current location </summary>
bool TryPeek(string text)
{
	// Not enough characters left for a match
	if (currentLocation + text.Length > input.Length) return false;

	// Compare in place (ordinal, like the == operator) instead of allocating
	// a substring for every peek
	return string.CompareOrdinal(input, currentLocation, text, 0, text.Length) == 0;
}
|
|
|
/// <summary>
/// Moves forward to the next occurrence of c (the character itself is not consumed).
/// Returns false, leaving the location at the end of the input, if there is none.
/// </summary>
bool TryMoveTo(char c)
{
	for (; currentLocation != input.Length; currentLocation++) {
		if (input[currentLocation] == c) return true;
	}
	return false;
}
|
|
|
/// <summary>
/// Moves forward to the next occurrence of any of the given characters (it is not consumed).
/// Returns false, leaving the location at the end of the input, if there is none.
/// </summary>
bool TryMoveToAnyOf(params char[] c)
{
	for (; currentLocation != input.Length; currentLocation++) {
		if (Array.IndexOf(c, input[currentLocation]) >= 0) return true;
	}
	return false;
}
|
|
|
/// <summary> Returns the input text between the two offsets, taken from the string cache </summary>
string GetText(int start, int end)
{
	// An empty range at the very end of the input bypasses the string cache
	bool emptyAtEnd = start == input.Length && end == input.Length;
	return emptyAtEnd ? string.Empty : GetCachedString(input.Substring(start, end - start));
}
|
|
|
/// <summary> Characters treated as whitespace by the parser </summary>
private static readonly char[] WhiteSpaceChars = new char[] {' ', '\n', '\r', '\t'};
/// <summary> Whitespace plus the markup characters that terminate a name </summary>
private static readonly char[] WhiteSpaceAndReservedChars = new char[] {' ', '\n', '\r', '\t', '<', '=', '>', '/', '?'};
|
|
|
/// <summary> True if the next character is one of WhiteSpaceChars; never moves the current location </summary>
bool TryPeekWhiteSpace()
{
	if (currentLocation == input.Length)
		return false;

	char next = input[currentLocation];
	return Array.IndexOf(WhiteSpaceChars, next) >= 0;
}
|
|
|
/// <summary>
/// Reads a name: all characters up to the next whitespace or reserved character
/// (or the end of the input). The result may be an empty string.
/// </summary>
/// <exception cref="Exception"> The current location is at the end of the input </exception>
string ReadName()
{
	AssertHasMoreData();

	int start = currentLocation;
	// Pass the shared table directly; copying it with ToArray() on every call is pointless
	TryMoveToAnyOf(WhiteSpaceAndReservedChars);
	return GetText(start, currentLocation);
}
|
|
|
/// <summary>
/// Reads the whole remaining input as a document: a sequence of tags, elements
/// and character data. A document cached at the current location is reused.
/// </summary>
RawDocument ReadDocument()
{
	RawDocument cachedDocument = ReadFromCache<RawDocument>(currentLocation);
	if (cachedDocument != null) return cachedDocument;

	RawDocument doc = new RawDocument();
	doc.StartOffset = currentLocation;
	while (!IsEndOfFile()) {
		// Markup starts with '<'; everything else is character data
		if (TryPeek('<')) {
			doc.AddChild(ReadElementOrTag());
		} else {
			doc.AddChild(ReadCharacterData());
		}
	}
	doc.EndOffset = currentLocation;

	LogParsed(doc);
	parsedItems.Add(doc);
	return doc;
}
|
|
|
/// <summary>
/// Reads the markup at the current location: a lone tag if it opens with
/// "&lt;!", "&lt;/" or "&lt;?", otherwise a whole element.
/// </summary>
/// <exception cref="Exception"> The input is at its end or does not continue with '&lt;' </exception>
RawObject ReadElementOrTag()
{
	AssertHasMoreData();

	if (!TryPeek('<'))
		throw new Exception("'<' expected");

	bool isLoneTag = TryPeek("<!") || TryPeek("</") || TryPeek("<?");
	if (isLoneTag)
		return ReadTag();
	return ReadElement();
}
|
|
|
/// <summary>
/// Reads an element: its start tag and, if the start tag is closed by ">",
/// the content up to and including the end tag (or up to the end of the input).
/// An element cached at the current location is reused.
/// </summary>
/// <exception cref="Exception"> The current location is at the end of the input </exception>
RawElement ReadElement()
{
	AssertHasMoreData();

	RawElement element = ReadFromCache<RawElement>(currentLocation);
	if (element != null) return element;

	element = new RawElement();

	element.StartOffset = currentLocation;
	// Read start tag
	element.AddChild(ReadTag());
	Debug.Assert(element.StartTag.IsStartTag);
	// Read content and end tag; any other closing bracket (e.g. "/>") means there is no content
	if (element.StartTag.ClosingBracket == ">") {
		while (!IsEndOfFile()) {
			if (TryPeek('<')) {
				RawObject content = ReadElementOrTag();
				// The end tag is part of the element too. It used to be consumed
				// (and covered by EndOffset) without being added, so the tree
				// no longer described the whole text of the element.
				element.AddChild(content);
				RawTag contentTag = content as RawTag;
				if (contentTag != null && contentTag.IsEndTag) break;
			} else {
				element.AddChild(ReadCharacterData());
			}
		}
	}
	element.EndOffset = currentLocation;

	LogParsed(element);
	parsedItems.Add(element);
	return element;
}
|
|
|
|
|
// Start tag: "<" Name? (RawText+ RawAttribute)* RawText* (">" | "/>")
// End tag: "</" Name? (RawText+ RawAttribute)* RawText* ">"
// P.instr.: "<?" Name? (RawText+ RawAttribute)* RawText* "?>"
// Comment: "<!" partof("--")? (RawText)* "-->" (Name is always null)
// CData: "<!" partof("[CDATA[") (RawText)* "]]" ">" (Name is always null)
// DTD: "<!" partof("DOCTYPE") (RawText)* ">" (Name is always null)

/// <summary>
/// Reads one tag of any kind according to the grammar above.
/// A tag cached at the current location is reused.
/// </summary>
/// <exception cref="Exception"> The input is at its end or does not continue with '&lt;' </exception>
RawTag ReadTag()
{
	AssertHasMoreData();

	RawTag cachedTag = ReadFromCache<RawTag>(currentLocation);
	if (cachedTag != null) return cachedTag;

	RawTag tag = new RawTag();
	tag.StartOffset = currentLocation;

	// Read the opening bracket
	// It identifies the type of tag and parsing behavior for the rest of it
	tag.OpeningBracket = ReadOpeningBracket();

	// Read the name
	if ((tag.IsStartTag || tag.IsEndTag || tag.IsProcessingInstruction) && HasMoreData()) {
		tag.Name = ReadName();
	}

	string bracket;
	if (tag.IsStartTag || tag.IsEndTag || tag.IsProcessingInstruction) {
		// Read attributes for the tag
		for (;;) {
			if (TryPeekWhiteSpace()) {
				tag.AddChild(ReadWhiteSpace());
			}
			if (TryReadClosingBracket(out bracket)) {
				tag.ClosingBracket = bracket;
				break;
			}
			// Stop at the start of the next tag or at the end of file
			if (TryPeek('<') || !HasMoreData()) break;
			tag.AddChild(ReadAttribulte());
		}
	} else {
		// Simple tag types: the content is plain text up to the closing text
		string closingText = null;
		if (tag.IsComment) {
			// TODO: Be strict only if the opening bracket is complete
			closingText = "-->";
		} else if (tag.IsCData) {
			// TODO: Be strict only if the opening bracket is complete
			closingText = "]]>";
		} else if (tag.IsDocumentType) {
			// TODO: Nested definition
			closingText = ">";
		}
		if (closingText != null) {
			tag.AddChildren(ReadTextUntil(closingText).ToList());
		}
		if (TryReadClosingBracket(out bracket)) {
			tag.ClosingBracket = bracket;
		}
	}
	tag.EndOffset = currentLocation;

	LogParsed(tag);
	parsedItems.Add(tag);
	return tag;
}
|
|
|
/// <summary>
/// Reads any of the known opening brackets.
/// Also accepts them if they are incomplete; one character is sufficient
/// </summary>
/// <exception cref="Exception"> The input does not continue with '&lt;' </exception>
string ReadOpeningBracket()
{
	// We are using a lot of string literals so that the memory instances are shared
	int start = currentLocation;
	if (!TryRead('<'))
		throw new Exception("'<' expected");

	if (TryRead('/')) return "</";
	if (TryRead('?')) return "<?";
	if (!TryRead('!')) return "<";

	// "<!" was read: comment, CDATA section, document type or unknown
	if (TryRead('-'))
		return TryRead('-') ? "<!--" : "<!-";
	// Possibly incomplete keyword; return exactly what was consumed
	if (TryReadPartOf("[CDATA[") || TryReadPartOf("DOCTYPE"))
		return GetText(start, currentLocation);
	return "<!";
}
|
|
|
/// <summary>
/// Reads any of the known closing brackets.
/// Also accepts them if they are incomplete; one character is sufficient
/// </summary>
/// <param name="bracket"> The text that was consumed, or null if nothing was read </param>
/// <returns> True if a (possibly incomplete) closing bracket was consumed </returns>
bool TryReadClosingBracket(out string bracket)
{
	// We are using a lot of string literals so that the memory instances are shared
	int start = currentLocation;

	if (TryRead('>')) {
		bracket = ">";
		return true;
	}
	if (TryRead('/')) {
		bracket = TryRead('>') ? "/>" : "/";
		return true;
	}
	if (TryRead('?')) {
		bracket = TryRead('>') ? "?>" : "?";
		return true;
	}
	// Possibly incomplete end of comment or CDATA; return exactly what was consumed
	if (TryReadPartOf("-->") || TryReadPartOf("]]>")) {
		bracket = GetText(start, currentLocation);
		return true;
	}

	bracket = null;
	return false;
}
|
|
|
/// <summary>
/// Reads an attribute: a name, optionally followed by an equals sign (with
/// surrounding whitespace) and a quoted value. An attribute cached at the
/// current location is reused.
/// </summary>
/// <exception cref="Exception"> The current location is at the end of the input </exception>
RawAttribute ReadAttribulte()
{
	AssertHasMoreData();

	RawAttribute cachedAttribute = ReadFromCache<RawAttribute>(currentLocation);
	if (cachedAttribute != null) return cachedAttribute;

	RawAttribute attr = new RawAttribute();
	attr.StartOffset = currentLocation;
	if (HasMoreData()) attr.Name = ReadName();

	int afterName = currentLocation;
	attr.EqualsSign = string.Empty;
	if (TryPeekWhiteSpace()) attr.EqualsSign += ReadWhiteSpace().Value;
	if (!TryRead('=')) {
		// No equals sign: the attribute ends right after the name, so give back
		// the whitespace that was read ahead
		attr.EqualsSign = null;
		currentLocation = afterName;
	} else {
		attr.EqualsSign += "=";
		if (TryPeekWhiteSpace()) attr.EqualsSign += ReadWhiteSpace().Value;
		// Read attribute value
		int valueStart = currentLocation;
		if (TryPeek('"') || TryPeek('\'')) {
			char quote = input[currentLocation];
			TryRead(quote);
			// The value runs to the matching quote; a '<' ends an unterminated value
			TryMoveToAnyOf(quote, '<');
			TryRead(quote);
			attr.Value = GetText(valueStart, currentLocation);
		}
	}
	attr.EndOffset = currentLocation;

	parsedItems.Add(attr);
	return attr;
}
|
|
|
/// <summary>
/// Reads a run of whitespace characters as a text node.
/// A text node cached at the current location is reused.
/// </summary>
/// <exception cref="Exception"> The current location is at the end of the input </exception>
RawText ReadWhiteSpace()
{
	AssertHasMoreData();

	RawText cachedText = ReadFromCache<RawText>(currentLocation);
	if (cachedText != null) return cachedText;

	int start = currentLocation;
	// A successful peek guarantees that there is a character to step over
	while (TryPeekWhiteSpace()) currentLocation++;

	RawText ws = new RawText();
	ws.StartOffset = start;
	ws.Value = GetText(start, currentLocation);
	ws.EndOffset = currentLocation;

	Debug.Assert(ws.Value.Length > 0);

	parsedItems.Add(ws);
	return ws;
}
|
|
|
/// <summary>
/// Reads character data: everything up to the next '&lt;' or the end of the input.
/// A text node cached at the current location is reused.
/// </summary>
RawText ReadCharacterData()
{
	Debug.Assert(HasMoreData());

	RawText cachedText = ReadFromCache<RawText>(currentLocation);
	if (cachedText != null) return cachedText;

	int start = currentLocation;
	TryMoveTo('<');

	RawText charData = new RawText();
	charData.StartOffset = start;
	charData.Value = GetText(start, currentLocation);
	charData.EndOffset = currentLocation;

	Debug.Assert(charData.Value.Length > 0);

	parsedItems.Add(charData);
	return charData;
}
|
|
|
/// <summary>
/// Reads text up to (not including) the given closing text, or up to the end
/// of the input if the closing text does not occur.
/// </summary>
/// <param name="closingText"> Text that terminates the read; must not be empty </param>
/// <returns>
/// The text node cached at the current location (if any) followed by a node for
/// the remaining text (if any). Zero-length nodes are never returned.
/// </returns>
IEnumerable<RawObject> ReadTextUntil(string closingText)
{
	// Reuse the text cached at this location; the lookup moves the current location past it
	RawText cachedText = ReadFromCache<RawText>(currentLocation);
	// TODO: How many return? Ensure the output is same as before
	if (cachedText != null) yield return cachedText;

	// Scan whatever is left before the closing text
	int start = currentLocation;
	while (true) {
		if (!TryMoveTo(closingText[0])) break; // End of file
		if (TryPeek(closingText)) break; // Match
		TryMoveNext();
	}

	// Nothing (more) to read: the cached node covered everything, the content is
	// empty (e.g. "<!---->") or the input has ended. Do not create and cache
	// a zero-length text node.
	if (currentLocation == start) yield break;

	RawText charData = new RawText();
	charData.StartOffset = start;
	charData.Value = GetText(start, currentLocation);
	charData.EndOffset = currentLocation;

	parsedItems.Add(charData);
	yield return charData;
}
|
} |
|
}
|
|
|