// <file>
//     <version>$Revision$</version>
// </file>
// NOTE(review): the remaining lines of the original file header were lost when markup
// was stripped from this file; restore them from version control.

using System;
using System.Collections.Generic;
using System.Diagnostics;
using System.Linq;
using System.Xml.Linq;

using ICSharpCode.AvalonEdit.Document;

namespace ICSharpCode.AvalonEdit.XmlParser
{
    /// <summary>
    /// Creates object tree from XML document.
    /// </summary>
    /// <remarks>
    /// The created tree fully describes the document and thus the original XML file can be
    /// exactly reproduced.
    ///
    /// Any further parses will reparse only the changed parts and the existing tree will
    /// be updated with the changes.  The user can add event handlers to be notified of
    /// the changes.  The parser tries to minimize the number of changes to the tree.
    /// (for example, it will add a single child at the start of collection rather than
    /// clearing the collection and adding new children)
    ///
    /// The object tree consists of following types:
    ///   RawObject - Abstract base class for all types
    ///     RawContainer - Abstract base class for all types that can contain child nodes
    ///       RawDocument - The root object of the XML document
    ///       RawElement - Logical grouping of other nodes together.  The first child is always the start tag.
    ///       RawTag - Represents any markup starting with "&lt;" and (hopefully) ending with "&gt;"
    ///     RawAttribute - Name-value pair in a tag
    ///     RawText - Whitespace or character data
    ///
    /// For example, see the following XML and the produced object tree:
    /// <![CDATA[
    ///   <!-- My favourite quote -->
    ///   <quote author="Albert Einstein">
    ///     Make everything as simple as possible, but not simpler.
    ///   </quote>
    ///
    ///   RawDocument
    ///     RawTag "<!--" "-->"
    ///       RawText " My favourite quote "
    ///     RawElement
    ///       RawTag "<" "quote" ">"
    ///         RawText " "
    ///         RawAttribute 'author="Albert Einstein"'
    ///       RawText "\n  Make everything as simple as possible, but not simpler.\n"
    ///       RawTag "</" "quote" ">"
    /// ]]>
    ///
    /// The precise content of RawTag depends on what it represents:
    /// <![CDATA[
    ///   Start tag:  "<"  Name?  (RawText+ RawAttribute)* RawText* (">" | "/>")
    ///   End tag:    "</" Name?  (RawText+ RawAttribute)* RawText* ">"
    ///   P.instr.:   "<?" Name?  (RawText+ RawAttribute)* RawText* "?>"
    ///   Comment:    "<!--"      (RawText)* "-->"      (Name is always null)
    ///   DTD:        "<!DOCTYPE" (RawText)* ">"        (Name is always null)
    /// ]]>
    ///   CData:      "&lt;![CDATA[" (RawText)* "]]&gt;"      (Name is always null)
    ///
    /// The type of tag can be identified by the opening bracket.
    /// There are helper properties in the RawTag class to identify the type, exactly
    /// one of the properties will be true.
    ///
    /// The closing bracket may be missing or may be different for malformed XML.
    ///
    /// Note that there can always be multiple consecutive RawText nodes.
    /// This is to ensure that individual texts are not too long.
    /// </remarks>
    public class XmlParser
    {
        // The tree handed out to the user; updated in place after every parse.
        RawDocument userDocument = new RawDocument();
        XDocument userLinqDocument;
        TextDocument textDocument;
        // Every object produced by a Read* method, indexed by text offset, so that
        // unchanged parts of the document can be reused by the next parse.
        TextSegmentCollection<RawObject> parsedItems = new TextSegmentCollection<RawObject>();
        // Document edits recorded since the last call to Parse().
        List<DocumentChangeEventArgs> changesSinceLastParse = new List<DocumentChangeEventArgs>();

        /// <summary>
        /// Create new parser, but do not parse the text yet.
        /// </summary>
        /// <param name="textDocument">Document to parse; its Changed event is observed from now on.</param>
        /// <exception cref="ArgumentNullException"><paramref name="textDocument"/> is null.</exception>
        public XmlParser(TextDocument textDocument)
        {
            if (textDocument == null)
                throw new ArgumentNullException("textDocument");

            this.userLinqDocument = userDocument.GetXDocument();
            this.textDocument = textDocument;
            // Remember every edit so that Parse() can invalidate the affected cached items.
            this.textDocument.Changed += delegate(object sender, DocumentChangeEventArgs e) {
                changesSinceLastParse.Add(e);
            };
        }

        /// <summary>
        /// Incrementally parse the document.
        /// </summary>
        /// <returns>
        /// The user-visible document tree (always the same instance), updated from the
        /// freshly parsed tree.
        /// </returns>
        public RawDocument Parse()
        {
            currentLocation = 0;
            input = textDocument.Text;

            foreach (DocumentChangeEventArgs change in changesSinceLastParse) {
                // Update offsets of all items
                parsedItems.UpdateOffsets(change);

                // Remove any items affected by the change.
                // The range is widened by two characters on each side and clamped to the document.
                int start = change.Offset - 2;
                int end = change.Offset + change.InsertionLength + 2;
                start = Math.Max(Math.Min(start, textDocument.TextLength - 1), 0);
                end = Math.Max(Math.Min(end, textDocument.TextLength - 1), 0);
                // NOTE(review): items are removed while iterating the query result; this assumes
                // FindOverlappingSegments returns a snapshot rather than a live view - confirm.
                foreach (RawObject obj in parsedItems.FindOverlappingSegments(start, end - start)) {
                    parsedItems.Remove(obj);
                    Log("Removed cached item {0}", obj);
                }
            }
            changesSinceLastParse.Clear();

            RawDocument parsedDocument = ReadDocument();
            // Just in case parse method was called redundantly
            if (parsedDocument.ReadCallID != userDocument.ReadCallID) {
                PrintStringCacheStats();
                RawObject.LogDom("Updating main DOM tree...");
            }
            userDocument.UpdateDataFrom(parsedDocument);
            return userDocument;
        }

        /// <summary>
        /// Looks for a cached object of type <typeparamref name="T"/> that starts exactly at
        /// <paramref name="location"/>.  On a hit, currentLocation is advanced by the
        /// length of the object.
        /// </summary>
        /// <returns>The cached object, or null if there is none.</returns>
        T ReadFromCache<T>(int location) where T: RawObject
        {
            RawObject obj = parsedItems.FindFirstSegmentWithStartAfter(location);
            // Several objects may start at the same offset (e.g. an element and its start tag)
            while (obj != null && obj.StartOffset == location) {
                T hit = obj as T;
                if (hit != null) {
                    currentLocation += hit.Length;
                    return hit;
                }
                obj = parsedItems.GetNextSegment(obj);
            }
            return null;
        }

        /// <summary> Writes a formatted diagnostic message to the debug output. </summary>
        void Log(string text, params object[] pars)
        {
            System.Diagnostics.Debug.WriteLine("XML Parser: " + text, pars);
        }

        /// <summary> Writes a diagnostic message about a newly parsed object to the debug output. </summary>
        void LogParsed(RawObject obj)
        {
            System.Diagnostics.Debug.WriteLine("XML Parser: Parsed " + obj.ToString());
        }

        // Maps a string to its canonical instance so that equal strings share memory.
        Dictionary<string, string> stringCache = new Dictionary<string, string>();
        int stringCacheRequestedCount;
        int stringCacheRequestedSize;
        int stringCacheSavedCount;
        int stringCacheSavedSize;

        /// <summary>
        /// Returns the canonical instance of the given string, registering the string as
        /// canonical if it is seen for the first time.  Also maintains the cache statistics.
        /// </summary>
        string GetCachedString(string cached)
        {
            // The statistics estimate the size of a string as 8 + 2 * Length bytes
            stringCacheRequestedCount += 1;
            stringCacheRequestedSize += 8 + 2 * cached.Length;

            // Strings of any length are cached; no length cut-off is applied.
            string existing;
            if (stringCache.TryGetValue(cached, out existing)) {
                // Get the instance from the cache instead
                stringCacheSavedCount += 1;
                stringCacheSavedSize += 8 + 2 * cached.Length;
                return existing;
            }

            // Add to cache
            stringCache.Add(cached, cached);
            return cached;
        }

        /// <summary> Logs the accumulated string cache statistics. </summary>
        void PrintStringCacheStats()
        {
            // The percentage is computed in 64 bits; savedSize * 100 overflows an int
            // once more than about 21 MB has been saved.
            Log("String cache: Requested {0} ({1} bytes);  Saved {2} ({3} bytes); {4}% Saved",
                stringCacheRequestedCount, stringCacheRequestedSize,
                stringCacheSavedCount, stringCacheSavedSize,
                stringCacheRequestedSize == 0 ? 0 : (long)stringCacheSavedSize * 100 / stringCacheRequestedSize);
        }

        // Text being parsed (snapshot taken at the start of Parse) and the read position in it
        string input;
        int currentLocation;

        /// <summary> True if the read position is at the end of the input. </summary>
        bool IsEndOfFile()
        {
            return currentLocation == input.Length;
        }

        /// <summary> True if the read position is before the end of the input. </summary>
        bool HasMoreData()
        {
            return currentLocation < input.Length;
        }

        /// <summary> Throws if the read position is at the end of the input. </summary>
        void AssertHasMoreData()
        {
            if (currentLocation == input.Length) {
                throw new Exception("Unexpected end of files");
            }
        }

        // The methods start with 'try' to make it clear they can silently fail.
        // Read methods without 'try' have to succeed or throw exception.
        //
        // For example:
        //   while(true) TryMoveNext();   is obviously infinite loop
        // whereas
        //   while(true) MoveNext();      should eventually throw exception (if MoveNext existed)
        //

        /// <summary> Advances by one character; returns false at the end of the input. </summary>
        bool TryMoveNext()
        {
            if (currentLocation == input.Length) return false;

            currentLocation++;
            return true;
        }

        /// <summary> Consumes the given character if it is the next one in the input. </summary>
        bool TryRead(char c)
        {
            if (!TryPeek(c)) return false;

            currentLocation++;
            return true;
        }

        /// <summary> Consumes the given text if the input continues with it. </summary>
        bool TryRead(string text)
        {
            if (!TryPeek(text)) return false;

            currentLocation += text.Length;
            return true;
        }

        /// <summary> Consumes the longest prefix of the text that the input continues with. </summary>
        /// <returns> Returns true if at least one character was read </returns>
        bool TryReadPartOf(string text)
        {
            if (!TryPeek(text[0])) return false;

            // Keep reading until character differs or we have end of file
            foreach (char c in text) {
                if (!TryRead(c)) break;
            }
            return true;
        }

        /// <summary> True if the next character of the input is the given one. </summary>
        bool TryPeek(char c)
        {
            if (currentLocation == input.Length) return false;

            return input[currentLocation] == c;
        }

        /// <summary> True if the input continues with the given text. </summary>
        bool TryPeek(string text)
        {
            if (currentLocation + text.Length > input.Length) return false;

            // Ordinal comparison in place; avoids allocating a substring for every peek
            return string.CompareOrdinal(input, currentLocation, text, 0, text.Length) == 0;
        }

        /// <summary>
        /// Moves forward to the next occurrence of the character (without consuming it).
        /// If there is none, moves to the end of the input and returns false.
        /// </summary>
        bool TryMoveTo(char c)
        {
            int index = input.IndexOf(c, currentLocation);
            if (index < 0) {
                currentLocation = input.Length;
                return false;
            }
            currentLocation = index;
            return true;
        }

        /// <summary>
        /// Moves forward to the next occurrence of any of the characters (without consuming it).
        /// If there is none, moves to the end of the input and returns false.
        /// </summary>
        bool TryMoveToAnyOf(params char[] c)
        {
            int index = input.IndexOfAny(c, currentLocation);
            if (index < 0) {
                currentLocation = input.Length;
                return false;
            }
            currentLocation = index;
            return true;
        }

        /// <summary>
        /// Returns the input text in the range [start, end) as a shared (cached) string instance.
        /// </summary>
        string GetText(int start, int end)
        {
            if (start == input.Length && end == input.Length) {
                return string.Empty;
            } else {
                return GetCachedString(input.Substring(start, end - start));
            }
        }

        static readonly char[] WhiteSpaceChars = new char[] {' ', '\n', '\r', '\t'};
        // Characters that terminate a name
        static readonly char[] WhiteSpaceAndReservedChars = new char[] {' ', '\n', '\r', '\t', '<', '=', '>', '/', '?'};

        /// <summary> True if the next character of the input is a space, tab, CR or LF. </summary>
        bool TryPeekWhiteSpace()
        {
            if (currentLocation == input.Length) return false;

            return Array.IndexOf(WhiteSpaceChars, input[currentLocation]) >= 0;
        }

        /// <summary>
        /// Reads a name: everything up to the next whitespace or reserved character
        /// (or the end of the input).  The result may be empty.
        /// </summary>
        string ReadName()
        {
            AssertHasMoreData();

            int start = currentLocation;
            TryMoveToAnyOf(WhiteSpaceAndReservedChars);
            return GetText(start, currentLocation);
        }

        /// <summary>
        /// Reads the whole input as a document: a sequence of elements, tags and character data.
        /// </summary>
        RawDocument ReadDocument()
        {
            RawDocument doc = ReadFromCache<RawDocument>(currentLocation);
            if (doc != null) return doc;

            doc = new RawDocument();
            doc.StartOffset = currentLocation;

            while (!IsEndOfFile()) {
                if (TryPeek('<')) {
                    doc.AddChild(ReadElementOrTag());
                } else {
                    doc.AddChild(ReadCharacterData());
                }
            }

            doc.EndOffset = currentLocation;

            LogParsed(doc);
            parsedItems.Add(doc);
            return doc;
        }

        /// <summary>
        /// Reads whatever markup starts at the current position: a standalone tag
        /// (end tag, processing instruction, comment, CData or DTD) or a whole element.
        /// </summary>
        // NOTE(review): the middle of this method and the header of ReadElement were destroyed
        // when markup was stripped from the file.  The dispatch below is reconstructed from
        // the tag grammar used by ReadTag and ReadOpeningBracket; verify it (including the
        // exception message) against version control.
        RawObject ReadElementOrTag()
        {
            AssertHasMoreData();

            if (TryPeek("<!") || TryPeek("</") || TryPeek("<?")) {
                return ReadTag();
            } else if (TryPeek('<')) {
                return ReadElement();
            } else {
                throw new Exception("'<' expected");
            }
        }

        /// <summary>
        /// Reads an element: its start tag followed, unless the start tag is not closed by
        /// ">", by the content up to the first end tag or the end of the input.
        /// </summary>
        RawElement ReadElement()
        {
            AssertHasMoreData();

            RawElement element = ReadFromCache<RawElement>(currentLocation);
            if (element != null) return element;

            element = new RawElement();
            element.StartOffset = currentLocation;

            // Read start tag
            element.AddChild(ReadTag());
            Debug.Assert(element.StartTag.IsStartTag);

            // Read content and end tag
            if (element.StartTag.ClosingBracket == ">") {
                while (!IsEndOfFile()) {
                    if (TryPeek('<')) {
                        RawObject content = ReadElementOrTag();
                        // The first end tag terminates the content.
                        // Note that the end tag itself is not added as a child here.
                        if (content is RawTag && ((RawTag)content).IsEndTag) break;
                        element.AddChild(content);
                    } else {
                        element.AddChild(ReadCharacterData());
                    }
                }
            }

            element.EndOffset = currentLocation;

            LogParsed(element);
            parsedItems.Add(element);
            return element;
        }

        // Start tag:  "<"  Name?  (RawText+ RawAttribute)* RawText* (">" | "/>")
        // End tag:    "</" Name?  (RawText+ RawAttribute)* RawText* ">"
        // P.instr.:   "<?" Name?  (RawText+ RawAttribute)* RawText* "?>"
        // Comment:    "<!--"      (RawText)* "-->"      (Name is always null)
        // CData:      "<![CDATA[" (RawText)* "]]>"      (Name is always null)
        // DTD:        "<!DOCTYPE" (RawText)* ">"        (Name is always null)

        /// <summary>
        /// Reads a single tag of any type.  The closing bracket is left unset if it is missing.
        /// </summary>
        RawTag ReadTag()
        {
            AssertHasMoreData();

            RawTag tag = ReadFromCache<RawTag>(currentLocation);
            if (tag != null) return tag;

            tag = new RawTag();
            tag.StartOffset = currentLocation;

            // Read the opening bracket
            // It identifies the type of tag and parsing behavior for the rest of it
            tag.OpeningBracket = ReadOpeningBracket();

            if (tag.IsStartTag || tag.IsEndTag || tag.IsProcessingInstruction) {
                // Read the name
                if (HasMoreData()) {
                    tag.Name = ReadName();
                }

                // Read attributes for the tag
                while (true) {
                    if (TryPeekWhiteSpace()) {
                        tag.AddChild(ReadWhiteSpace());
                    }

                    string bracket;
                    if (TryReadClosingBracket(out bracket)) {
                        tag.ClosingBracket = bracket;
                        break;
                    }

                    // Start of another tag; this one is not closed
                    if (TryPeek('<')) break;

                    if (HasMoreData()) {
                        tag.AddChild(ReadAttribute());
                        continue;
                    }
                    break; // End of file
                }
            } else {
                // Simple tag types
                if (tag.IsComment) {
                    // TODO: Be strict only if the opening bracket is complete
                    tag.AddChildren(ReadTextUntil("-->").ToList());
                } else if (tag.IsCData) {
                    // TODO: Be strict only if the opening bracket is complete
                    tag.AddChildren(ReadTextUntil("]]>").ToList());
                } else if (tag.IsDocumentType) {
                    // TODO: Nested definition
                    tag.AddChildren(ReadTextUntil(">").ToList());
                }

                string bracket;
                if (TryReadClosingBracket(out bracket)) {
                    tag.ClosingBracket = bracket;
                }
            }

            tag.EndOffset = currentLocation;

            LogParsed(tag);
            parsedItems.Add(tag);
            return tag;
        }

        /// <summary>
        /// Reads any of the known opening brackets.
        /// Also accepts them if they are incomplete; one character is sufficient.
        /// </summary>
        // NOTE(review): everything after the "</" branch was destroyed when markup was stripped
        // from the file.  The remaining branches are reconstructed from the tag grammar and from
        // the shape of TryReadClosingBracket; verify them (including the exception message)
        // against version control.
        string ReadOpeningBracket()
        {
            // We are using a lot of string literals so that the memory instances are shared
            int start = currentLocation;
            if (TryRead('<')) {
                if (TryRead('/')) {
                    return "</";
                } else if (TryRead('?')) {
                    return "<?";
                } else if (TryRead('!')) {
                    if (TryReadPartOf("--")) {
                        return GetText(start, currentLocation);
                    } else if (TryReadPartOf("[CDATA[")) {
                        return GetText(start, currentLocation);
                    } else if (TryReadPartOf("DOCTYPE")) {
                        return GetText(start, currentLocation);
                    } else {
                        return "<!";
                    }
                } else {
                    return "<";
                }
            } else {
                throw new Exception("'<' expected");
            }
        }

        /// <summary>
        /// Reads any of the known closing brackets.
        /// Also accepts them if they are incomplete; one character is sufficient.
        /// </summary>
        /// <param name="bracket">The text that was read, or null if nothing was read.</param>
        /// <returns>True if a (possibly incomplete) closing bracket was read.</returns>
        bool TryReadClosingBracket(out string bracket)
        {
            // We are using a lot of string literals so that the memory instances are shared
            int start = currentLocation;
            if (TryRead('>')) {
                bracket = ">";
            } else if (TryRead('/')) {
                bracket = TryRead('>') ? "/>" : "/";
            } else if (TryRead('?')) {
                bracket = TryRead('>') ? "?>" : "?";
            } else if (TryReadPartOf("-->")) {
                bracket = GetText(start, currentLocation);
            } else if (TryReadPartOf("]]>")) {
                bracket = GetText(start, currentLocation);
            } else {
                bracket = null;
                return false;
            }
            return true;
        }

        /// <summary>
        /// Reads an attribute: a name, optionally followed by an equals sign (with any
        /// surrounding whitespace) and a quoted value.
        /// </summary>
        RawAttribute ReadAttribute()
        {
            AssertHasMoreData();

            RawAttribute attr = ReadFromCache<RawAttribute>(currentLocation);
            if (attr != null) return attr;

            attr = new RawAttribute();
            attr.StartOffset = currentLocation;

            if (HasMoreData()) attr.Name = ReadName();

            // Whitespace after the name belongs to the attribute only if an equals sign follows
            int checkpoint = currentLocation;
            attr.EqualsSign = string.Empty;
            if (TryPeekWhiteSpace()) attr.EqualsSign += ReadWhiteSpace().Value;
            if (TryRead('=')) {
                attr.EqualsSign += "=";
                if (TryPeekWhiteSpace()) attr.EqualsSign += ReadWhiteSpace().Value;

                // Read attribute value.
                // The stored value includes the quotes; an unclosed value ends at '<'
                // or at the end of the input.  An unquoted value is not read.
                int start = currentLocation;
                if (TryRead('"')) {
                    TryMoveToAnyOf('"', '<');
                    TryRead('"');
                    attr.Value = GetText(start, currentLocation);
                } else if (TryRead('\'')) {
                    TryMoveToAnyOf('\'', '<');
                    TryRead('\'');
                    attr.Value = GetText(start, currentLocation);
                }
            } else {
                // No equals sign; give back the whitespace
                attr.EqualsSign = null;
                currentLocation = checkpoint;
            }

            attr.EndOffset = currentLocation;

            parsedItems.Add(attr);
            return attr;
        }

        /// <summary> Reads a non-empty run of whitespace characters. </summary>
        RawText ReadWhiteSpace()
        {
            AssertHasMoreData();

            RawText ws = ReadFromCache<RawText>(currentLocation);
            if (ws != null) return ws;

            ws = new RawText();
            ws.StartOffset = currentLocation;

            int start = currentLocation;
            while (TryPeekWhiteSpace()) TryMoveNext();
            ws.Value = GetText(start, currentLocation);

            ws.EndOffset = currentLocation;

            Debug.Assert(ws.Value.Length > 0);

            parsedItems.Add(ws);
            return ws;
        }

        /// <summary>
        /// Reads character data: everything up to the next '&lt;' or the end of the input.
        /// </summary>
        RawText ReadCharacterData()
        {
            Debug.Assert(HasMoreData());

            RawText charData = ReadFromCache<RawText>(currentLocation);
            if (charData != null) return charData;

            charData = new RawText();
            charData.StartOffset = currentLocation;

            int start = currentLocation;
            TryMoveTo('<');
            charData.Value = GetText(start, currentLocation);

            charData.EndOffset = currentLocation;

            Debug.Assert(charData.Value.Length > 0);

            parsedItems.Add(charData);
            return charData;
        }

        /// <summary>
        /// Reads text up to (but not including) the given closing text, or up to the end of
        /// the input if the closing text is not found.
        /// </summary>
        /// <remarks>
        /// This is an iterator; nothing is read until the result is enumerated.
        /// </remarks>
        IEnumerable<RawText> ReadTextUntil(string closingText)
        {
            Debug.Assert(HasMoreData());

            RawText charData = ReadFromCache<RawText>(currentLocation);
            // TODO: How many return?  Ensure the output is same as before
            if (charData != null) {
                yield return charData;
                // The cached text has already been consumed.  Without stopping here a second,
                // empty text would be created, cached and returned after it.
                yield break;
            }

            charData = new RawText();
            charData.StartOffset = currentLocation;

            int start = currentLocation;
            while (true) {
                if (!TryMoveTo(closingText[0])) break; // End of file
                if (TryPeek(closingText)) break;       // Match
                TryMoveNext();
            }
            charData.Value = GetText(start, currentLocation);

            charData.EndOffset = currentLocation;

            Debug.Assert(charData.Value.Length > 0);

            parsedItems.Add(charData);
            yield return charData;
        }
    }
}