You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
1606 lines
49 KiB
1606 lines
49 KiB
// <file> |
|
// <copyright see="prj:///doc/copyright.txt"/> |
|
// <license see="prj:///doc/license.txt"/> |
|
// <owner name="David Srbecký" email="dsrbecky@gmail.com"/> |
|
// <version>$Revision$</version> |
|
// </file> |
|
|
|
using ICSharpCode.AvalonEdit.Utils; |
|
using System; |
|
using System.Collections.Generic; |
|
using System.Diagnostics; |
|
using System.Globalization; |
|
using System.Linq; |
|
using System.Text; |
|
using ICSharpCode.AvalonEdit.Document; |
|
|
|
namespace ICSharpCode.AvalonEdit.XmlParser |
|
{ |
|
/// <summary> |
|
/// Creates object tree from XML document. |
|
/// </summary> |
|
/// <remarks> |
|
/// The created tree fully describes the document and thus the original XML file can be
/// exactly reproduced.
|
/// |
|
/// Any further parses will reparse only the changed parts and the existing tree will |
|
/// be updated with the changes. The user can add event handlers to be notified of |
|
/// the changes. The parser tries to minimize the number of changes to the tree. |
|
/// (for example, it will add a single child at the start of collection rather than |
|
/// clearing the collection and adding new children) |
|
/// |
|
/// The object tree consists of following types: |
|
/// RawObject - Abstract base class for all types
/// RawContainer - Abstract base class for all types that can contain child nodes
|
/// RawDocument - The root object of the XML document |
|
/// RawElement - Logical grouping of other nodes together. The first child is always the start tag. |
|
/// RawTag - Represents any markup starting with "<" and (hopefully) ending with ">" |
|
/// RawAttribute - Name-value pair in a tag |
|
/// RawText - Whitespace or character data |
|
/// |
|
/// For example, see the following XML and the produced object tree: |
|
/// <![CDATA[ |
|
/// <!-- My favourite quote --> |
|
/// <quote author="Albert Einstein"> |
|
/// Make everything as simple as possible, but not simpler. |
|
/// </quote> |
|
/// |
|
/// RawDocument |
|
/// RawTag "<!--" "-->" |
|
/// RawText " My favourite quote " |
|
/// RawElement |
|
/// RawTag "<" "quote" ">" |
|
/// RawText " " |
|
/// RawAttribute 'author="Albert Einstein"' |
|
/// RawText "\n Make everything as simple as possible, but not simpler.\n" |
|
/// RawTag "</" "quote" ">" |
|
/// ]]> |
|
/// |
|
/// The precise content of RawTag depends on what it represents: |
|
/// <![CDATA[ |
|
/// Start tag: "<" Name? (RawText+ RawAttribute)* RawText* (">" | "/>") |
|
/// End tag: "</" Name? (RawText+ RawAttribute)* RawText* ">" |
|
/// P.instr.: "<?" Name? (RawText)* "?>" |
|
/// Comment: "<!--" (RawText)* "-->" |
|
/// CData: "<![CDATA[" (RawText)* "]]" ">" |
|
/// DTD: "<!DOCTYPE" (RawText+ RawTag)* RawText* ">" (DOCTYPE or other DTD names) |
|
/// UnknownBang: "<!" (RawText)* ">"
|
/// ]]> |
|
/// |
|
/// The type of tag can be identified by the opening bracket.
/// There are helper properties in the RawTag class to identify the type, exactly
/// one of the properties will be true.
///
/// The closing bracket may be missing or may be different for malformed XML.
|
/// |
|
/// Note that there can always be multiple consecutive RawText nodes.
/// This is to ensure that individual texts are not too long.
|
/// |
|
/// XML Spec: http://www.w3.org/TR/xml/ |
|
/// XML EBNF: http://www.jelks.nu/XML/xmlebnf.html |
|
/// |
|
/// Internals: |
|
/// |
|
/// "Try" methods can silently fail by returning false.
/// MoveTo methods do not move if they are already at the given target.
/// If methods return some object, it must be non-empty. It is up to the caller to ensure
/// the context is appropriate for reading.
|
/// |
|
/// </remarks> |
|
public class XmlParser |
|
{ |
|
/// <summary> Tree returned to the user; Parse() updates it in place from the freshly parsed tree </summary>
RawDocument userDocument;

/// <summary> Source document; stays null when the parser was created from a plain string </summary>
TextDocument textDocument;

/// <summary> Document changes collected since the last call to Parse() </summary>
List<DocumentChangeEventArgs> changesSinceLastParse = new List<DocumentChangeEventArgs>();

/// <summary> Previously parsed items, kept for as long as they are valid </summary>
TextSegmentCollection<RawObject> parsedItems = new TextSegmentCollection<RawObject>();

/// <summary>
/// Identifies which range of the input was touched while an object was parsed.
/// The default range is (StartOffset, EndOffset + 1) and is not stored here.
/// </summary>
TextSegmentCollection<TouchedMemoryRange> touchedMemoryRanges = new TextSegmentCollection<TouchedMemoryRange>();
|
|
|
/// <summary>
/// Text segment describing the input range that was read while parsing
/// <see cref="TouchedByObject"/>.
/// </summary>
class TouchedMemoryRange : TextSegment
{
    /// <summary> The parsed object that read this range </summary>
    public RawObject TouchedByObject { get; set; }
}
|
|
|
/// <summary>
/// Generate a syntax error when an entity reference other than the built-in ones is seen.
/// Both constructors set this to true.
/// </summary>
public bool EntityReferenceIsError { get; set; }
|
|
|
/// <summary>
/// Creates a parser for the given text. Nothing is parsed until Parse() is called.
/// </summary>
/// <param name="input"> The XML text to parse </param>
public XmlParser(string input)
{
    RawDocument document = new RawDocument();
    document.Parser = this;

    this.input = input;
    this.userDocument = document;
    this.EntityReferenceIsError = true;
}
|
|
|
/// <summary>
/// Creates a parser bound to a text document. Nothing is parsed until Parse() is called.
/// Every change of the document is recorded so that the next parse can be incremental.
/// </summary>
/// <param name="textDocument"> The document to read the text from </param>
public XmlParser(TextDocument textDocument): this(textDocument.Text)
{
    this.textDocument = textDocument;
    // Remember each change; Parse() uses the list to invalidate cached items
    this.textDocument.Changed += (sender, e) => changesSinceLastParse.Add(e);
}
|
|
|
/// <summary> Throws an exception with the given message when the condition is false </summary>
/// <param name="condition"> Value that is expected to be true </param>
/// <param name="message"> Text appended to "Assertion failed: " </param>
protected static void Assert(bool condition, string message)
{
    if (condition) return;

    throw new Exception("Assertion failed: " + message);
}
|
|
|
/// <summary>
/// Throws an exception with the given message when the condition is false.
/// Calls are compiled only when the DEBUG symbol is defined.
/// </summary>
/// <param name="condition"> Value that is expected to be true </param>
/// <param name="message"> Text appended to "Assertion failed: " </param>
[Conditional("DEBUG")]
protected static void DebugAssert(bool condition, string message)
{
    if (condition) return;

    throw new Exception("Assertion failed: " + message);
}
|
|
|
/// <summary>
/// Incrementally parses the document.
/// Cached items affected by changes recorded since the last parse are dropped,
/// the text is read again and the user tree is updated from the result.
/// </summary>
/// <returns> The user document, updated to reflect the current text </returns>
public RawDocument Parse()
{
    // Pick up the current text when bound to a document
    if (textDocument != null) {
        input = textDocument.Text;
    }

    // Use the recorded changes to invalidate the cache
    foreach (DocumentChangeEventArgs edit in changesSinceLastParse) {
        // Shift the offsets of everything that is cached
        parsedItems.UpdateOffsets(edit);
        touchedMemoryRanges.UpdateOffsets(edit);

        Log("Changed offset {0}", edit.Offset);

        // Removing will cause one of the ends to be set to the change offset.
        // FindSegmentsContaining also returns segments that merely touch the offset,
        // which conveniently takes care of the +1 byte
        foreach (RawObject affected in parsedItems.FindSegmentsContaining(edit.Offset)) {
            RemoveFromCache(affected, false);
        }

        // Objects that only looked at the changed location are invalid as well
        foreach (TouchedMemoryRange range in touchedMemoryRanges.FindSegmentsContaining(edit.Offset)) {
            Log("Found that {0} dependeds on memory ({1}-{2})", range.TouchedByObject, range.StartOffset, range.EndOffset);
            RemoveFromCache(range.TouchedByObject, true);
            touchedMemoryRanges.Remove(range);
        }
    }
    changesSinceLastParse.Clear();

    // Restart the reader at the beginning of the text
    currentLocation = 0;
    maxTouchedLocation = 0;
    inputLength = input.Length;

    RawDocument freshDocument = ReadDocument();
    freshDocument.DebugCheckConsistency(true);
    PrintStringCacheStats();

    RawObject.LogDom("Updating main DOM tree...");
    userDocument.UpdateTreeFrom(freshDocument);
    userDocument.DebugCheckConsistency(false);
    return userDocument;
}
|
|
|
/// <summary>
/// Finds all cached objects that wholly contain the given object.
/// </summary>
/// <param name="child"> Object whose containers are wanted </param>
/// <returns> Cached objects, other than the child itself, whose range covers the child's range </returns>
List<RawObject> FindParents(RawObject child)
{
    List<RawObject> containers = new List<RawObject>();
    foreach (RawObject candidate in parsedItems.FindSegmentsContaining(child.StartOffset)) {
        if (candidate == child) continue;
        // A parent is anyone wholly containing the child
        bool startsBefore = candidate.StartOffset <= child.StartOffset;
        bool endsAfter = child.EndOffset <= candidate.EndOffset;
        if (startsBefore && endsAfter) {
            containers.Add(candidate);
        }
    }
    return containers;
}
|
|
|
/// <summary> Removes an object from the cache, optionally together with all its parents </summary>
/// <param name="obj"> Object to remove </param>
/// <param name="includeParents"> When true, every cached object containing <paramref name="obj"/> is removed first </param>
internal void RemoveFromCache(RawObject obj, bool includeParents)
{
    if (includeParents) {
        foreach (RawObject parent in FindParents(obj)) {
            if (!parsedItems.Remove(parent)) continue;
            parent.IsInCache = false;
            Log("Removing cached item {0} (it is parent)", parent);
        }
    }

    if (!parsedItems.Remove(obj)) return;
    obj.IsInCache = false;
    Log("Removed cached item {0}", obj);
}
|
|
|
/// <summary>
/// Reads a cached object of type T starting at the current location, accepting any candidate.
/// </summary>
/// <param name="res"> The cached object, or a new instance when nothing was found </param>
/// <returns> True if the object came from the cache </returns>
bool TryReadFromCacheOrNew<T>(out T res) where T: RawObject, new()
{
    Predicate<T> acceptAll = delegate(T candidate) { return true; };
    return TryReadFromCacheOrNew<T>(acceptAll, out res);
}
|
|
|
/// <summary>
/// Looks for a cached object of type T that starts exactly at the current location and
/// satisfies the condition. On success the reader is moved past the object.
/// </summary>
/// <param name="conditon"> Extra test the cached object must pass </param>
/// <param name="res"> The cached object, or a new instance when nothing was found </param>
/// <returns> True if the object came from the cache </returns>
bool TryReadFromCacheOrNew<T>(Predicate<T> conditon, out T res) where T: RawObject, new()
{
    for (RawObject candidate = parsedItems.FindFirstSegmentWithStartAfter(currentLocation);
         candidate != null && candidate.StartOffset == currentLocation;
         candidate = parsedItems.GetNextSegment(candidate))
    {
        T typed = candidate as T;
        if (typed == null || !conditon(typed)) continue;

        // Skip over the text the cached object covers
        currentLocation += candidate.Length;
        res = typed;
        return true;
    }

    res = new T();
    return false;
}
|
|
|
/// <summary>
/// Finds the start offset of the first cached object of type T that satisfies the condition
/// and starts within [offset, offset + lookaheadCount].
/// </summary>
/// <param name="conditon"> Extra test the cached object must pass </param>
/// <param name="offset"> Where the search starts </param>
/// <param name="lookaheadCount"> How far past the offset an object may start </param>
/// <returns> Start offset of the object, or -1 when there is none </returns>
int GetStartOfCachedObject<T>(Predicate<T> conditon, int offset, int lookaheadCount) where T: RawObject
{
    int lastAllowedStart = offset + lookaheadCount;
    RawObject candidate = parsedItems.FindFirstSegmentWithStartAfter(offset);
    // The offset is rechecked for every candidate
    while (candidate != null && offset <= candidate.StartOffset && candidate.StartOffset <= lastAllowedStart) {
        T typed = candidate as T;
        if (typed != null && conditon(typed)) {
            return candidate.StartOffset;
        }
        candidate = parsedItems.GetNextSegment(candidate);
    }
    return -1;
}
|
|
|
/// <summary>
/// Validates a freshly parsed object, adds it to the cache and, when the parser looked
/// beyond the current location, records the touched input range.
/// </summary>
/// <param name="obj"> The object that has just been parsed </param>
void OnParsed(RawObject obj)
{
    bool isDocument = obj is RawDocument;
    if (obj.Length == 0 && !isDocument)
        throw new Exception(string.Format("Could not parse {0}. It has zero length.", obj));

    RawContainer container = obj as RawContainer;
    if (container != null) {
        foreach (RawObject child in container.Children) {
            bool nestedProperly = obj.StartOffset <= child.StartOffset && child.EndOffset <= obj.EndOffset;
            if (!nestedProperly)
                throw new Exception("Wrong nesting");
        }
    }

    parsedItems.Add(obj);
    obj.IsInCache = true;
    Log("Parsed {0}", obj);

    if (maxTouchedLocation <= currentLocation) return;

    // The location is assumed to be read so the range ends at (location + 1)
    // For example for "a_" it is (0-2)
    TouchedMemoryRange touched = new TouchedMemoryRange();
    touched.StartOffset = obj.StartOffset;
    touched.Length = (maxTouchedLocation + 1 - obj.StartOffset);
    touched.TouchedByObject = obj;
    touchedMemoryRanges.Add(touched);
    Log(" - Touched memory range: ({0}-{1})", touched.StartOffset, touched.EndOffset);
}
|
|
|
/// <summary> Writes a formatted message, prefixed with "XML Parser: ", to the debug output </summary>
/// <param name="text"> Composite format string </param>
/// <param name="pars"> Arguments for the format string </param>
void Log(string text, params object[] pars)
{
    // The formatting stays inside the call so it is only evaluated when the call is compiled in
    Debug.WriteLine(string.Format("XML Parser: " + text, pars));
}
|
|
|
/// <summary> Maps a string to its shared instance (key and value are the same string) </summary>
Dictionary<string, string> stringCache = new Dictionary<string, string>();

// Statistics reported by PrintStringCacheStats.
// Sizes are counted as 8 + 2 * length per string.
int stringCacheRequestedCount;
int stringCacheRequestedSize;
int stringCacheStoredCount;
int stringCacheStoredSize;
|
|
|
/// <summary>
/// Returns a shared instance of the given string so that equal short strings
/// use the same object. Strings longer than 32 characters are returned as they are.
/// </summary>
/// <param name="cached"> The string to look up; must not be null </param>
/// <returns> A string equal to <paramref name="cached"/> </returns>
string GetCachedString(string cached)
{
    stringCacheRequestedCount += 1;
    stringCacheRequestedSize += 8 + 2 * cached.Length;

    // Do not bother with long strings.
    // (The check used to be inverted, which cached only the long strings.)
    if (cached.Length > 32) return cached;

    string shared;
    if (stringCache.TryGetValue(cached, out shared)) {
        // Get the instance from the cache instead
        return shared;
    }

    // Add to cache
    stringCacheStoredCount += 1;
    stringCacheStoredSize += 8 + 2 * cached.Length;
    stringCache.Add(cached, cached);
    return cached;
}
|
|
|
/// <summary> Logs how many strings were requested from the string cache and how many were stored </summary>
void PrintStringCacheStats()
{
    // Avoid division by zero when nothing was requested yet
    int percentStored = 0;
    if (stringCacheRequestedSize != 0) {
        percentStored = stringCacheStoredSize * 100 / stringCacheRequestedSize;
    }
    Log("String cache: Requested {0} ({1} bytes); Actaully stored {2} ({3} bytes); {4}% stored",
        stringCacheRequestedCount,
        stringCacheRequestedSize,
        stringCacheStoredCount,
        stringCacheStoredSize,
        percentStored);
}
|
|
|
/// <summary> Reports a syntax error covering the single character at the current location </summary>
/// <param name="obj"> Object the error is attached to </param>
/// <param name="message"> Composite format string of the error text </param>
/// <param name="args"> Arguments for the format string </param>
void OnSyntaxError(RawObject obj, string message, params object[] args)
{
    int errorStart = currentLocation;
    int errorEnd = currentLocation + 1;
    OnSyntaxError(obj, errorStart, errorEnd, message, args);
}
|
|
|
/// <summary> Logs a syntax error and attaches it to the given object </summary>
/// <param name="obj"> Object the error is attached to </param>
/// <param name="start"> Start offset of the error </param>
/// <param name="end"> End offset of the error; widened to start + 1 when it is not after start </param>
/// <param name="message"> Composite format string of the error text </param>
/// <param name="args"> Arguments for the format string </param>
void OnSyntaxError(RawObject obj, int start, int end, string message, params object[] args)
{
    // An error always spans at least one character
    if (end <= start) end = start + 1;

    string description = string.Format(message, args);
    Log("Syntax error ({0}-{1}): {2}", start, end, description);

    SyntaxError error = new SyntaxError();
    error.Object = obj;
    error.StartOffset = start;
    error.EndOffset = end;
    error.Message = description;
    obj.AddSyntaxError(error);
}
|
|
|
#region Text reading methods |
|
|
|
/// <summary> The text being parsed </summary>
string input;

/// <summary> Length of the text, captured at the start of Parse() </summary>
int inputLength;

// Parsing methods must never set this value directly;
// most importantly, never backtrack other than through GoBack(int)
int currentLocation;

// currentLocation is always assumed to be touched, and that fact does not
// have to be recorded in this variable.
// This stores any value bigger than that, if applicable, so the
// actual value is max(currentLocation, maxTouchedLocation)
int maxTouchedLocation;
|
|
|
/// <summary> True when the reader stands exactly at the end of the input </summary>
bool IsEndOfFile()
{
    bool atEnd = (inputLength == currentLocation);
    return atEnd;
}
|
|
|
/// <summary> Throws if the reader is not at the end of the input </summary>
void AssertIsEndOfFile()
{
    bool atEnd = IsEndOfFile();
    Assert(atEnd, "End of file expected at this point");
}
|
|
|
/// <summary> True when there is at least one unread character </summary>
bool HasMoreData()
{
    bool moreLeft = (inputLength > currentLocation);
    return moreLeft;
}
|
|
|
/// <summary> Throws if there is nothing left to read </summary>
void AssertHasMoreData()
{
    bool moreLeft = HasMoreData();
    Assert(moreLeft, "Unexpected end of file");
}
|
|
|
/// <summary> Advances by one character unless the reader is at the end of the input </summary>
/// <returns> True if the reader moved </returns>
bool TryMoveNext()
{
    if (currentLocation != inputLength) {
        currentLocation++;
        return true;
    }
    return false;
}
|
|
|
/// <summary>
/// Moves the reader back to an earlier location, remembering how far it had read
/// in maxTouchedLocation.
/// </summary>
/// <param name="oldLocation"> Location to return to; must not be after the current one </param>
void GoBack(int oldLocation)
{
    if (currentLocation < oldLocation) throw new Exception("Trying to move forward");

    // Everything up to the current location has been looked at
    if (currentLocation > maxTouchedLocation) {
        maxTouchedLocation = currentLocation;
    }
    currentLocation = oldLocation;
}
|
|
|
/// <summary> Consumes the next character if it equals <paramref name="c"/> </summary>
/// <returns> True if the character was consumed </returns>
bool TryRead(char c)
{
    bool matched = currentLocation != inputLength && input[currentLocation] == c;
    if (matched) {
        currentLocation++;
    }
    return matched;
}
|
|
|
/// <summary> Consumes the next character if it is one of the given characters </summary>
/// <returns> True if a character was consumed </returns>
bool TryReadAnyOf(params char[] c)
{
    if (currentLocation == inputLength) return false;

    bool matched = Array.IndexOf(c, input[currentLocation]) >= 0;
    if (matched) {
        currentLocation++;
    }
    return matched;
}
|
|
|
/// <summary> Consumes the given text if the input continues with it </summary>
/// <returns> True if the text was consumed </returns>
bool TryRead(string text)
{
    if (!TryPeek(text)) return false;

    currentLocation += text.Length;
    return true;
}
|
|
|
/// <summary>
/// Tests the character located <paramref name="back"/> positions before the current location.
/// </summary>
/// <param name="c"> Expected character </param>
/// <param name="back"> How many characters to look back </param>
/// <returns> True if that position is inside the input and holds <paramref name="c"/> </returns>
bool TryPeekPrevious(char c, int back)
{
    int index = currentLocation - back;
    // Reject anything outside of the input; the original test only caught index == inputLength
    if (index < 0 || index >= inputLength) return false;

    return input[index] == c;
}
|
|
|
/// <summary> Tests whether the next character equals <paramref name="c"/> without consuming it </summary>
bool TryPeek(char c)
{
    return currentLocation != inputLength && input[currentLocation] == c;
}
|
|
|
/// <summary> Tests whether the next character is one of the given characters without consuming it </summary>
bool TryPeekAnyOf(params char[] chars)
{
    if (currentLocation == inputLength) return false;

    char next = input[currentLocation];
    return Array.IndexOf(chars, next) >= 0;
}
|
|
|
/// <summary>
/// Tests whether the input continues with the given text without consuming it.
/// Once the first character matches, the whole compared range is recorded as touched.
/// </summary>
/// <param name="text"> Text to look for; must not be empty </param>
/// <returns> True if the input at the current location starts with <paramref name="text"/> </returns>
bool TryPeek(string text)
{
    if (!TryPeek(text[0])) return false; // Early exit

    maxTouchedLocation = Math.Max(maxTouchedLocation, currentLocation + (text.Length - 1));
    // The following comparison 'touches' the end of file - it does depend on the end being there
    if (currentLocation + text.Length > inputLength) return false;

    // Ordinal comparison in place; avoids allocating a substring on every peek
    return string.CompareOrdinal(input, currentLocation, text, 0, text.Length) == 0;
}
|
|
|
/// <summary> Tests whether the next character is a space, tab, line feed or carriage return </summary>
bool TryPeekWhiteSpace()
{
    if (currentLocation == inputLength) return false;

    switch (input[currentLocation]) {
        case ' ':
        case '\t':
        case '\n':
        case '\r':
            return true;
        default:
            return false;
    }
}
|
|
|
// The move functions do not have to move if already at target |
|
// The move functions allow 'overriding' of the document length |
|
|
|
/// <summary> Moves to the next occurrence of <paramref name="c"/>, searching up to the end of the input </summary>
/// <returns> True if the character was found </returns>
bool TryMoveTo(char c)
{
    int limit = inputLength;
    return TryMoveTo(c, limit);
}
|
|
|
/// <summary>
/// Moves to the next occurrence of <paramref name="c"/> before the given limit.
/// When the character is not found the reader is moved to the limit.
/// </summary>
/// <param name="c"> Character to look for </param>
/// <param name="inputLength"> Overrides the length of the input for this search </param>
/// <returns> True if the character was found </returns>
bool TryMoveTo(char c, int inputLength)
{
    if (currentLocation == inputLength) return false;

    int hit = input.IndexOf(c, currentLocation, inputLength - currentLocation);
    bool found = hit != -1;
    currentLocation = found ? hit : inputLength;
    return found;
}
|
|
|
/// <summary> Moves to the next occurrence of any of the given characters, searching up to the end of the input </summary>
/// <returns> True if one of the characters was found </returns>
bool TryMoveToAnyOf(params char[] c)
{
    int limit = inputLength;
    return TryMoveToAnyOf(c, limit);
}
|
|
|
/// <summary>
/// Moves to the next occurrence of any of the given characters before the given limit.
/// When none is found the reader is moved to the limit.
/// </summary>
/// <param name="c"> Characters to look for </param>
/// <param name="inputLength"> Overrides the length of the input for this search </param>
/// <returns> True if one of the characters was found </returns>
bool TryMoveToAnyOf(char[] c, int inputLength)
{
    if (currentLocation == inputLength) return false;

    int hit = input.IndexOfAny(c, currentLocation, inputLength - currentLocation);
    bool found = hit != -1;
    currentLocation = found ? hit : inputLength;
    return found;
}
|
|
|
/// <summary> Moves to the next occurrence of the given text, searching up to the end of the input </summary>
/// <returns> True if the text was found </returns>
bool TryMoveTo(string text)
{
    int limit = inputLength;
    return TryMoveTo(text, limit);
}
|
|
|
/// <summary>
/// Moves to the next occurrence of the given text before the given limit.
/// When the text is not found the reader is moved to the limit.
/// </summary>
/// <param name="text"> Text to look for (ordinal comparison) </param>
/// <param name="inputLength"> Overrides the length of the input for this search </param>
/// <returns> True if the text was found </returns>
bool TryMoveTo(string text, int inputLength)
{
    if (currentLocation == inputLength) return false;

    int index = input.IndexOf(text, currentLocation, inputLength - currentLocation, StringComparison.Ordinal);
    if (index != -1) {
        // The whole match was looked at. Never lower a value recorded earlier
        // (for example by GoBack), otherwise the touched range would be too short.
        maxTouchedLocation = Math.Max(maxTouchedLocation, index + text.Length - 1);
        currentLocation = index;
        return true;
    } else {
        currentLocation = inputLength;
        return false;
    }
}
|
|
|
/// <summary> Skips white space, searching up to the end of the input </summary>
/// <returns> The result of the overload taking a limit, called with the input length </returns>
bool TryMoveToNonWhiteSpace()
{
    int limit = inputLength;
    return TryMoveToNonWhiteSpace(limit);
}
|
|
|
/// <summary>
/// Skips spaces, tabs, line feeds and carriage returns, but never moves past the given limit.
/// </summary>
/// <param name="inputLength">
/// Overrides the length of the input for this search. The original implementation ignored
/// this value and always read up to the real end of the input.
/// </param>
/// <returns> True if a non-white-space character was found before the limit </returns>
bool TryMoveToNonWhiteSpace(int inputLength)
{
    while (currentLocation < inputLength) {
        char c = input[currentLocation];
        if (c != ' ' && c != '\t' && c != '\n' && c != '\r') {
            return true; // Found non-white-space
        }
        currentLocation++; // Accept white space
    }
    return false; // Limit reached
}
|
|
|
/// <summary>
/// Reads a name token.
/// The name ends at the first of the following:
///   ""        End of file
///   " \n\r\t" White space
///   "=\'\""   Attribute value
///   "<>/?"    Tags
/// </summary>
/// <param name="res"> The name read, or an empty string when nothing was read </param>
/// <returns> True if at least one character was read </returns>
bool TryReadName(out string res)
{
    int nameStart = currentLocation;

    // Keep reading up to the first terminating character or the end of file
    while (currentLocation != inputLength) {
        char c = input[currentLocation];
        // Fast path: 0x41-0x7A (A-Z[\]^_`a-z) is always accepted
        bool fastAccept = 0x41 <= (int)c && (int)c <= 0x7A;
        if (!fastAccept) {
            bool terminator;
            switch (c) {
                case ' ': case '\n': case '\r': case '\t': // White space
                case '=': case '\'': case '"':             // Attributes
                case '<': case '>': case '/': case '?':    // Tags
                    terminator = true;
                    break;
                default:
                    terminator = false;                    // Any other character is accepted
                    break;
            }
            if (terminator) break;
        }
        currentLocation++;
    }

    bool readSomething = nameStart != currentLocation;
    res = readSomething ? GetText(nameStart, currentLocation) : string.Empty;
    return readSomething;
}
|
|
|
/// <summary> Returns the already read text between two offsets, passed through the string cache </summary>
/// <param name="start"> Offset of the first character </param>
/// <param name="end"> Offset just past the last character; must not be after the current location </param>
string GetText(int start, int end)
{
    if (end > currentLocation) throw new Exception("Reading ahead of current location");

    // Nothing can follow the end of the input
    bool emptyAtEnd = start == inputLength && end == inputLength;
    if (emptyAtEnd) return string.Empty;

    string fragment = input.Substring(start, end - start);
    return GetCachedString(fragment);
}
|
|
|
#endregion |
|
|
|
/// <summary>
/// Gets the flat hierarchy of the document.
/// Returns only Text, Tag or properly nested Element (the latter only from the cache).
/// </summary>
List<RawObject> ReadFlatObjects()
{
    List<RawObject> flat = new List<RawObject>();

    while (!IsEndOfFile()) {
        if (!TryPeek('<')) {
            // Anything not starting with '<' is character data
            flat.AddRange(ReadText(RawTextType.CharacterData));
            continue;
        }

        // Prefer a whole cached element; otherwise read just the tag
        RawElement cachedElement;
        bool fromCache = TryReadFromCacheOrNew<RawElement>(e => e.IsProperlyNested, out cachedElement);
        if (fromCache) {
            flat.Add(cachedElement);
        } else {
            flat.Add(ReadTag());
        }
    }

    return flat;
}
|
|
|
/// <summary>
/// Reads one tag: opening bracket, optional name, content (attributes, DTD content or text)
/// and closing bracket. Syntax errors are attached to the returned tag.
/// Context: "<"
/// </summary>
RawTag ReadTag()
{
    AssertHasMoreData();

    RawTag tag;
    if (TryReadFromCacheOrNew(out tag)) return tag;

    tag.StartOffset = currentLocation;

    // The opening bracket identifies the type of tag and the parsing behavior for the rest of it
    tag.OpeningBracket = ReadOpeningBracket();

    // Name
    bool expectsName = tag.IsStartOrEmptyTag || tag.IsEndTag || tag.IsProcessingInstruction;
    if (expectsName) {
        string tagName;
        if (!TryReadName(out tagName)) {
            OnSyntaxError(tag, "Element name expected");
        } else if (!IsValidName(tagName)) {
            OnSyntaxError(tag, currentLocation - tagName.Length, currentLocation, "The name '{0}' is invalid", tagName);
        }
        tag.Name = tagName;
    }

    // Content
    if (tag.IsStartOrEmptyTag || tag.IsEndTag) {
        // Attributes separated by white space
        while (!IsEndOfFile()) {
            // Check for all forbidden 'name' characters first - see TryReadName
            if (TryPeekWhiteSpace()) {
                tag.AddChildren(ReadText(RawTextType.WhiteSpace));
                continue; // End of file might be next
            }
            // Start of another tag or end of this one
            if (TryPeekAnyOf('<', '>', '/', '?')) break;

            // We have "=\'\"" or name - read attribute
            tag.AddChild(ReadAttribulte());
        }
    } else if (tag.IsDocumentType) {
        tag.AddChildren(ReadContentOfDTD());
    } else {
        int contentStart = currentLocation;

        RawTextType contentType;
        if (tag.IsComment) {
            contentType = RawTextType.Comment;
        } else if (tag.IsCData) {
            contentType = RawTextType.CData;
        } else if (tag.IsProcessingInstruction) {
            contentType = RawTextType.ProcessingInstruction;
        } else if (tag.IsUnknownBang) {
            contentType = RawTextType.UnknownBang;
        } else {
            throw new Exception(string.Format("Unknown opening bracket '{0}'", tag.OpeningBracket));
        }

        // ReadText is lazy; enumerate it now so that the reader actually moves
        IEnumerable<RawObject> content = ReadText(contentType).ToList();

        // Backtrack to the very start of the content when the tag was never closed
        bool unterminated = IsEndOfFile() || (tag.IsUnknownBang && TryPeek('<'));
        if (unterminated) {
            GoBack(contentStart);
        } else {
            tag.AddChildren(content);
        }
    }

    // Closing bracket
    string closingBracket;
    TryReadClosingBracket(out closingBracket);
    tag.ClosingBracket = closingBracket;

    // Error check
    string closing = tag.ClosingBracket;
    int brStart = currentLocation - (closing ?? string.Empty).Length;
    if (tag.Name != null) {
        if (tag.IsStartOrEmptyTag) {
            if (!(closing == ">" || closing == "/>")) OnSyntaxError(tag, brStart, currentLocation, "'>' or '/>' expected");
        } else if (tag.IsEndTag) {
            if (closing != ">") OnSyntaxError(tag, brStart, currentLocation, "'>' expected");
        } else if (tag.IsComment) {
            if (closing != "-->") OnSyntaxError(tag, brStart, currentLocation, "'-->' expected");
        } else if (tag.IsCData) {
            if (closing != "]]>") OnSyntaxError(tag, brStart, currentLocation, "']]>' expected");
        } else if (tag.IsProcessingInstruction) {
            if (closing != "?>") OnSyntaxError(tag, brStart, currentLocation, "'?>' expected");
        } else if (tag.IsUnknownBang) {
            if (closing != ">") OnSyntaxError(tag, brStart, currentLocation, "'>' expected");
        } else if (tag.IsDocumentType) {
            if (closing != ">") OnSyntaxError(tag, brStart, currentLocation, "'>' expected");
        } else {
            throw new Exception(string.Format("Unknown opening bracket '{0}'", tag.OpeningBracket));
        }
    }
    // else: the name is null - no closing bracket check is made in that case

    // An attribute name may not appear multiple times:
    // report every occurrence after the first one of each name
    var attributesByName = tag.Children.OfType<RawAttribute>().GroupBy(a => a.Name);
    foreach (var sameName in attributesByName) {
        foreach (RawAttribute repeated in sameName.Skip(1)) {
            OnSyntaxError(tag, repeated.StartOffset, repeated.EndOffset, "Attribute with name '{0}' already exists", repeated.Name);
        }
    }

    tag.EndOffset = currentLocation;

    OnParsed(tag);
    return tag;
}
|
|
|
/// <summary>
/// Reads any of the known opening brackets (only the full bracket).
/// Context: "<"
/// </summary>
/// <returns> The bracket that was read; throws when the input does not start with "<" </returns>
string ReadOpeningBracket()
{
    // String literals are returned so that the memory instances are shared
    if (!TryRead('<')) throw new Exception("'<' expected");

    if (TryRead('/')) return "</";
    if (TryRead('?')) return "<?";
    if (!TryRead('!')) return "<";

    // "<!" was read - find out which kind of bang tag it is
    if (TryRead("--")) return "<!--";
    if (TryRead("[CDATA[")) return "<![CDATA[";

    foreach (string dtdName in RawTag.DTDNames) {
        // The dtdName includes the "<!" which was consumed already
        string rest = dtdName.Substring(2);
        if (TryRead(rest)) return dtdName;
    }
    return "<!";
}
|
|
|
/// <summary>
/// Reads any of the known closing brackets (only the full bracket).
/// Context: any
/// </summary>
/// <param name="bracket"> The bracket that was read, or an empty string </param>
/// <returns> True if a closing bracket was read </returns>
bool TryReadClosingBracket(out string bracket)
{
    // String literals are assigned so that the memory instances are shared
    if (TryRead('>')) { bracket = ">"; return true; }
    if (TryRead("/>")) { bracket = "/>"; return true; }
    if (TryRead("?>")) { bracket = "?>"; return true; }
    if (TryRead("-->")) { bracket = "-->"; return true; }
    if (TryRead("]]>")) { bracket = "]]>"; return true; }

    bracket = string.Empty;
    return false;
}
|
|
|
/// <summary>
/// Reads the content of a DTD tag up to (but excluding) the closing ">" or a following "<".
/// Tags found inside a nested "[ ... ]" section are returned as tags, everything else as text.
/// The result is lazy - the reader only moves while the sequence is being enumerated.
/// </summary>
IEnumerable<RawObject> ReadContentOfDTD()
{
    int textStart = currentLocation;
    while (!IsEndOfFile()) {
        TryMoveToNonWhiteSpace();                   // Skip whitespace
        if (TryRead('\'')) TryMoveTo('\'');         // Skip single quoted string TODO: Bug
        if (TryRead('\"')) TryMoveTo('\"');         // Skip double quoted string
        if (TryRead('[')) {                         // Start of nested infoset
            while (!IsEndOfFile()) {
                TryMoveToAnyOf('<', ']');
                if (TryPeek('<')) {
                    // No text is produced between two directly following tags
                    if (textStart != currentLocation) {
                        yield return MakeText(textStart, currentLocation);
                    }
                    yield return ReadTag();
                    textStart = currentLocation;
                }
                if (TryPeek(']')) break;
            }
        }
        TryRead(']');                               // End of nested infoset
        if (TryPeekAnyOf('>', '<')) break;          // Proper closing or malformed XML
        TryMoveNext();                              // Skip anything else
    }
    if (textStart != currentLocation) {
        yield return MakeText(textStart, currentLocation);
    }
}
|
|
|
/// <summary>
/// Reads one attribute: name, equals sign (with surrounding white space) and value.
/// Malformed input is accepted; the problems are attached to the attribute as syntax errors.
/// Context: name or "=\'\""
/// </summary>
RawAttribute ReadAttribulte()
{
    AssertHasMoreData();

    RawAttribute attr;
    if (TryReadFromCacheOrNew(out attr)) return attr;

    attr.StartOffset = currentLocation;

    // Name
    string attrName;
    if (!TryReadName(out attrName)) {
        OnSyntaxError(attr, "Attribute name expected");
    } else if (!IsValidName(attrName)) {
        OnSyntaxError(attr, currentLocation - attrName.Length, currentLocation, "The name '{0}' is invalid", attrName);
    }
    attr.Name = attrName;

    // Equals sign and surrounding white space
    int beforeEquals = currentLocation;
    TryMoveToNonWhiteSpace();
    if (!TryRead('=')) {
        GoBack(beforeEquals);
        OnSyntaxError(attr, "'=' expected");
        attr.EqualsSign = string.Empty;
    } else {
        int afterEquals = currentLocation;
        TryMoveToNonWhiteSpace();
        if (!TryPeekAnyOf('"', '\'')) {
            // Do not read the white space if a quote does not follow
            GoBack(afterEquals);
        }
        attr.EqualsSign = GetText(beforeEquals, currentLocation);
    }

    // Value
    int quotedStart = currentLocation;
    char quoteChar = TryPeek('"') ? '"' : '\'';
    bool startsWithQuote = TryRead(quoteChar);
    int valueStart = currentLocation;
    if (!startsWithQuote) {
        // Unquoted value; a stray quote at its end is accepted as part of it
        ReadAttributeValue(null);
        TryRead('\"');
        TryRead('\'');
        if (valueStart == currentLocation) {
            OnSyntaxError(attr, "Attribute value expected");
        } else {
            OnSyntaxError(attr, valueStart, currentLocation, "Attribute value must be quoted");
        }
    } else {
        TryMoveToAnyOf(quoteChar, '<');
        if (!TryRead(quoteChar)) {
            // '<' or end of file
            GoBack(valueStart);
            ReadAttributeValue(quoteChar);
            OnSyntaxError(attr, "Quote {0} expected", quoteChar);
        } else if (!TryPeekAnyOf(' ', '\t', '\n', '\r', '/', '>', '?')) {
            bool quoteFollowsEquals = TryPeekPrevious('=', 2) || (TryPeekPrevious('=', 3) && TryPeekPrevious(' ', 2));
            if (!quoteFollowsEquals) {
                OnSyntaxError(attr, "White space or end of tag expected");
            } else {
                // This most likely means that we are in the value of the next attribute
                GoBack(valueStart);
                ReadAttributeValue(quoteChar);
                if (TryRead(quoteChar)) {
                    OnSyntaxError(attr, "White space or end of tag expected");
                } else {
                    OnSyntaxError(attr, "Quote {0} expected (or add whitespace after the following one)", quoteChar);
                }
            }
        }
    }

    attr.QuotedValue = GetText(quotedStart, currentLocation);
    attr.Value = Unquote(attr.QuotedValue);
    // The unquoted value starts one character later when there was an opening quote
    int valueOffset = startsWithQuote ? quotedStart + 1 : quotedStart;
    attr.Value = Dereference(attr, attr.Value, valueOffset);

    attr.EndOffset = currentLocation;

    OnParsed(attr);
    return attr;
}
|
|
|
/// <summary>
/// Reads everything up to a quote (excluding it), an opening/closing tag or an attribute signature.
/// </summary>
/// <param name="quote"> The quote that ends the value; when null, either kind of quote ends it </param>
void ReadAttributeValue(char? quote)
{
    while (!IsEndOfFile()) {
        // What is next?
        int roundStart = currentLocation;
        TryMoveToNonWhiteSpace(); // Read white space (if any)

        bool atQuote = quote.HasValue
            ? TryPeek(quote.Value)
            : (TryPeek('"') || TryPeek('\''));
        if (atQuote) return;

        // Opening/closing tag - the white space before it is not part of the value
        if (TryPeekAnyOf('<', '/', '>')) {
            GoBack(roundStart);
            return;
        }

        // Try reading an attribute signature
        string name;
        if (!TryReadName(out name)) {
            TryMoveNext(); // Accept everything else
            continue;
        }

        int nameEnd = currentLocation;
        bool startsAttribute =
            TryMoveToNonWhiteSpace() && TryRead("=") &&
            TryMoveToNonWhiteSpace() && TryPeekAnyOf('"', '\'');
        if (startsAttribute) {
            // Start of the next attribute - the value ends before it
            GoBack(roundStart);
            return;
        }

        // Just some garbage - make it part of the value and read more
        GoBack(nameEnd);
    }
}
|
|
|
/// <summary> Creates a text node of type Other for the already read range and registers it as parsed </summary>
/// <param name="start"> Offset of the first character </param>
/// <param name="end"> Offset just past the last character </param>
RawText MakeText(int start, int end)
{
    DebugAssert(end > start, "Empty text");

    RawText node = new RawText();
    node.StartOffset = start;
    node.EndOffset = end;
    node.EscapedValue = GetText(start, end);
    node.Type = RawTextType.Other;

    OnParsed(node);
    return node;
}
|
|
|
// Upper bound for the length of an entity reference.
// The longest built-in one is 10 characters ("&#1114111;")
const int maxEntityLength = 12;
// Maximal number of characters read into one text fragment
const int maxTextFragmentSize = 8;
// How far to look for already parsed text.
// More than the fragment size so that we do not get small "what was inserted" fragments
const int lookAheadLenght = (3 * maxTextFragmentSize) / 2;
|
|
|
/// <summary> |
|
/// Reads text and optionaly separates it into fragments. |
|
/// It can also return empty set for no appropriate text input. |
|
/// Make sure you enumerate it only once |
|
/// </summary> |
|
IEnumerable<RawObject> ReadText(RawTextType type)
{
	// The cache lookahead is expensive, so it is skipped for the first fragment
	bool useLookahead = false;
	for (;;) {
		RawText fragment;
		if (TryReadFromCacheOrNew(t => t.Type == type, out fragment)) {
			// A cached fragment was found - hand it out and try the next one;
			// the code below copes with "no text left"
			yield return fragment;
			continue;
		}
		fragment.Type = type;

		// Offset of the first character that must not be read into this fragment
		int limit = Math.Min(currentLocation + maxTextFragmentSize, inputLength);

		if (useLookahead) {
			// If some later text was already processed, end this fragment where the
			// cached one starts - but only if a whole entity would still fit
			int cachedStart = GetStartOfCachedObject<RawText>(t => t.Type == type, currentLocation, lookAheadLenght);
			if (cachedStart != -1 && cachedStart > currentLocation + maxEntityLength) {
				limit = Math.Min(cachedStart, inputLength);
				Log("Parsing only text ({0}-{1}) because later text was already processed", currentLocation, limit);
			}
		}
		useLookahead = true;

		int fragmentStart = currentLocation;
		fragment.StartOffset = fragmentStart;

		// Try to move to the terminator given by the context
		MoveToTextTerminator(fragment, type, limit);

		// More fragments follow only if neither a terminator nor the real end was reached
		bool moreFragments = currentLocation >= limit && !IsEndOfFile();
		if (moreFragments) {
			// If there is an entity reference near the end, go back so that the next
			// fragment starts with it; this prevents the reference from being fragmented
			int searchStart = Math.Max(fragmentStart + 1 /* data for us */, currentLocation - maxEntityLength);
			int searchLength = currentLocation - searchStart;
			if (searchLength > 0) {
				// Note that LastIndexOf works backward
				int ampersand = input.LastIndexOf('&', currentLocation - 1, searchLength);
				if (ampersand != -1) {
					GoBack(ampersand);
				}
			}
		}

		fragment.EscapedValue = GetText(fragmentStart, currentLocation);
		// Only character data contains references that need to be resolved
		fragment.Value = (type == RawTextType.CharacterData)
			? Dereference(fragment, fragment.EscapedValue, fragmentStart)
			: fragment.EscapedValue;
		fragment.EndOffset = currentLocation;

		// Empty fragments are not reported
		if (fragment.EscapedValue.Length > 0) {
			OnParsed(fragment);
			yield return fragment;
		}

		if (!moreFragments) {
			yield break;
		}
	}
}

/// <summary>
/// Moves the current location forward until the terminator appropriate for the
/// given text type is found or <paramref name="fragmentEnd"/> is reached.
/// Syntax errors found on the way are reported against <paramref name="text"/>.
/// </summary>
void MoveToTextTerminator(RawText text, RawTextType type, int fragmentEnd)
{
	switch (type) {
		case RawTextType.WhiteSpace:
			TryMoveToNonWhiteSpace(fragmentEnd);
			break;
		case RawTextType.CharacterData:
			// The loop ends at '<' or at the end of the fragment
			while (TryMoveToAnyOf(new char[] {'<', ']'}, fragmentEnd)) {
				if (TryPeek('<')) break;
				if (!TryPeek(']')) {
					throw new Exception("Infinite loop");
				}
				if (TryPeek("]]>")) {
					OnSyntaxError(text, currentLocation, currentLocation + 3, "']]>' is not allowed in text");
				}
				TryMoveNext();
			}
			break;
		case RawTextType.Comment: {
			// Do not report too many errors
			bool errorReported = false;
			// The loop ends at "-->" or at the end of the fragment
			while (TryMoveTo('-', fragmentEnd)) {
				if (TryPeek("-->")) break;
				if (TryPeek("--") && !errorReported) {
					OnSyntaxError(text, currentLocation, currentLocation + 2, "'--' is not allowed in comment");
					errorReported = true;
				}
				TryMoveNext();
			}
			break;
		}
		case RawTextType.CData:
			// TryMoveTo("]]>", fragmentEnd) can not be used because it may
			// incorrectly accept "]" at the end of the fragment
			while (TryMoveTo(']', fragmentEnd)) {
				if (TryPeek("]]>")) break;
				TryMoveNext();
			}
			break;
		case RawTextType.ProcessingInstruction:
			// The loop ends at "?>" or at the end of the fragment
			while (TryMoveTo('?', fragmentEnd)) {
				if (TryPeek("?>")) break;
				TryMoveNext();
			}
			break;
		case RawTextType.UnknownBang:
			TryMoveToAnyOf(new char[] {'<', '>'}, fragmentEnd);
			break;
		default:
			throw new Exception("Uknown type " + type);
	}
}
|
|
|
#region Helper methods |
|
|
|
/// <summary>
/// Checks whether the given string is a valid XML name.
/// </summary>
/// <param name="name">Name to check; may be null or empty.</param>
/// <returns>true if the name is accepted by XmlConvert.VerifyName; false otherwise.</returns>
static bool IsValidName(string name)
{
	// VerifyName signals null or empty input with ArgumentNullException rather
	// than XmlException, so that case has to be rejected up front
	if (string.IsNullOrEmpty(name)) {
		return false;
	}
	try {
		System.Xml.XmlConvert.VerifyName(name);
		return true;
	} catch (System.Xml.XmlException) {
		return false;
	}
}
|
|
|
/// <summary>
/// Strips the surrounding quote characters (" or ') from the given string.
/// </summary>
/// <remarks>
/// If the string starts with a quote, the opening quote is removed and the last
/// character is removed as well when it is the same quote character.
/// If only the last character is a quote, just that one is removed.
/// Null or empty input yields an empty string.
/// </remarks>
static string Unquote(string quoted)
{
	if (string.IsNullOrEmpty(quoted)) {
		return string.Empty;
	}

	bool startsWithQuote = quoted[0] == '"' || quoted[0] == '\'';
	if (quoted.Length == 1) {
		// A lone quote has no content
		return startsWithQuote ? string.Empty : quoted;
	}

	char lastChar = quoted[quoted.Length - 1];
	if (startsWithQuote) {
		// Matching closing quote: drop both, otherwise only the opening one
		return (quoted[0] == lastChar)
			? quoted.Substring(1, quoted.Length - 2)
			: quoted.Substring(1);
	}

	bool endsWithQuote = lastChar == '"' || lastChar == '\'';
	// Drop the stray closing quote, or keep the whole string
	return endsWithQuote ? quoted.Remove(quoted.Length - 1) : quoted;
}
|
|
|
/// <summary>
/// Replaces entity and character references in the given escaped text.
/// </summary>
/// <remarks>
/// The predefined entities (amp, lt, gt, apos, quot) and numeric character references
/// (&amp;#N; and &amp;#xH;) are resolved. Anything that can not be resolved is kept verbatim.
/// Unterminated and invalid numeric references are always reported as syntax errors;
/// unknown named entities are reported only if EntityReferenceIsError is set.
/// </remarks>
/// <param name="owner">Object against which syntax errors are reported.</param>
/// <param name="text">Escaped text to process.</param>
/// <param name="textLocation">Document offset of the first character of <paramref name="text"/>; used to locate errors.</param>
/// <returns>The dereferenced text; the original string if it contains no '&amp;'.</returns>
string Dereference(RawObject owner, string text, int textLocation)
{
	char[] entityDelimiters = new char[] {'&', ';'};
	StringBuilder sb = null;  // The dereferenced text so far (all up to 'curr')
	int curr = 0;
	while(true) {
		// Reached end of input
		if (curr == text.Length) {
			if (sb != null) {
				return sb.ToString();
			} else {
				return text;
			}
		}

		// Try to find reference
		int start = text.IndexOf('&', curr);

		// No more references found
		if (start == -1) {
			if (sb != null) {
				sb.Append(text, curr, text.Length - curr); // Add rest
				return sb.ToString();
			} else {
				return text;
			}
		}

		// Append text before the entity reference
		if (sb == null) sb = new StringBuilder(text.Length);
		sb.Append(text, curr, start - curr);
		curr = start;

		// Process the entity.
		// Errors are located in the escaped source text, so the offset has to be derived
		// from the position within 'text'. The length of the output built so far would be
		// too small as soon as an earlier reference was replaced by a shorter string.
		int errorLoc = textLocation + start;

		// Find entity name; the search is limited to maxEntityLength characters
		int end = text.IndexOfAny(entityDelimiters, start + 1, Math.Min(maxEntityLength, text.Length - (start + 1)));
		if (end == -1 || text[end] == '&') {
			// Not found
			OnSyntaxError(owner, errorLoc, errorLoc + 1, "Entity reference must be terminated with ';'");
			// Keep '&'
			sb.Append('&');
			curr++;
			continue;  // Restart at the next character location
		}
		string name = text.Substring(start + 1, end - (start + 1));

		// Resolve the name; null means "keep the reference as it is"
		string replacement;
		if (name == "amp") {
			replacement = "&";
		} else if (name == "lt") {
			replacement = "<";
		} else if (name == "gt") {
			replacement = ">";
		} else if (name == "apos") {
			replacement = "'";
		} else if (name == "quot") {
			replacement = "\"";
		} else if (name.Length > 0 && name[0] == '#') {
			// Numeric character reference; -1 marks a number that could not be parsed
			int num;
			if (name.Length > 1 && name[1] == 'x') {
				if (!int.TryParse(name.Substring(2), NumberStyles.AllowHexSpecifier, CultureInfo.InvariantCulture.NumberFormat, out num)) {
					num = -1;
					OnSyntaxError(owner, errorLoc + 3, errorLoc + 1 + name.Length, "Hexadecimal code of unicode character expected");
				}
			} else {
				if (!int.TryParse(name.Substring(1), NumberStyles.None, CultureInfo.InvariantCulture.NumberFormat, out num)) {
					num = -1;
					OnSyntaxError(owner, errorLoc + 2, errorLoc + 1 + name.Length, "Numeric code of unicode character expected");
				}
			}
			if (num != -1) {
				try {
					replacement = char.ConvertFromUtf32(num);
				} catch (ArgumentOutOfRangeException) {
					// Not a valid code point
					replacement = null;
					OnSyntaxError(owner, errorLoc + 2, errorLoc + 1 + name.Length, "Invalid unicode character U+{0:X} ({0})", num);
				}
			} else {
				replacement = null;
			}
		} else {
			replacement = null;
			if (this.EntityReferenceIsError) {
				OnSyntaxError(owner, errorLoc, errorLoc + 1 + name.Length + 1, "Unknown entity reference '{0}'", name);
			}
		}

		// Append the replacement to output
		if (replacement != null) {
			sb.Append(replacement);
		} else {
			sb.Append('&');
			sb.Append(name);
			sb.Append(';');
		}
		curr = end + 1;
		continue;
	}
}
|
|
|
#endregion |
|
|
|
#region Object stream reading |
|
|
|
/// <summary>
/// Builds a compact textual description of the given objects for logging.
/// </summary>
/// <remarks>
/// Tags are printed as bracket + name + bracket, the placeholders as "#StartTag#" and
/// "#EndTag#", elements as their children enclosed in '[' and ']' and text as '~'.
/// </remarks>
string PrintObjects(IEnumerable<RawObject> objs)
{
	StringBuilder output = new StringBuilder();
	foreach(RawObject obj in objs) {
		RawTag tag = obj as RawTag;
		RawElement element = obj as RawElement;
		if (tag != null) {
			if (obj == StartTagPlaceholder) {
				output.Append("#StartTag#");
			} else if (obj == EndTagPlaceholder) {
				output.Append("#EndTag#");
			} else {
				output.Append(tag.OpeningBracket);
				output.Append(tag.Name);
				output.Append(tag.ClosingBracket);
			}
		} else if (element != null) {
			// Nested content is printed recursively
			string nested = PrintObjects(element.Children);
			output.Append('[');
			output.Append(nested);
			output.Append(']');
		} else if (obj is RawText) {
			output.Append('~');
		} else {
			throw new Exception("Should not be here: " + obj);
		}
	}
	return output.ToString();
}
|
|
|
/// <summary>
/// Reads the whole input and builds the document object from it.
/// </summary>
/// <remarks>
/// The flat object stream is read first, then placeholders are inserted by MatchTags
/// so that the tags nest properly, and finally the stream is turned into
/// texts and elements which become the children of the document.
/// </remarks>
RawDocument ReadDocument()
{
	RawDocument document = new RawDocument();
	document.Parser = this;

	List<RawObject> flatObjects = ReadFlatObjects();
	Log("Flat stream: {0}", PrintObjects(flatObjects));
	List<RawObject> nestedObjects = MatchTags(flatObjects);
	Log("Fixed stream: {0}", PrintObjects(nestedObjects));

	IEnumerator<RawObject> stream = nestedObjects.GetEnumerator();
	stream.MoveNext(); // Move to first
	for (;;) {
		// The end of the stream is detected by a null (or unreadable) current item
		bool endOfStream;
		try {
			endOfStream = stream.Current == null;
		} catch (InvalidCastException) {
			endOfStream = true;
		}
		if (endOfStream) break;

		document.AddChild(ReadTextOrElement(stream));
	}

	// The document spans from its first to its last child
	if (document.Children.Count > 0) {
		document.StartOffset = document.FirstChild.StartOffset;
		document.EndOffset = document.LastChild.EndOffset;
	}

	OnParsed(document);
	return document;
}
|
|
|
/// <summary>
/// Returns the object the stream is positioned at and advances the stream by one item.
/// </summary>
RawObject ReadSingleObject(IEnumerator<RawObject> objStream)
{
	RawObject consumed = objStream.Current;
	// The result of MoveNext is not inspected here
	objStream.MoveNext();
	return consumed;
}
|
|
|
/// <summary>
/// Reads the next item from the stream: an element if the stream is positioned at
/// a start tag, an empty-element tag or the start tag placeholder; otherwise the single current object.
/// </summary>
RawObject ReadTextOrElement(IEnumerator<RawObject> objStream)
{
	AssertIsEndOfFile();

	RawObject current = objStream.Current;
	// Text and already built elements are taken over as they are
	if (current is RawText || current is RawElement) {
		return ReadSingleObject(objStream);
	}

	// Anything else has to be a tag
	RawTag tag = (RawTag)current;
	if (tag == StartTagPlaceholder || tag.IsStartOrEmptyTag) {
		return ReadElement(objStream);
	}
	return ReadSingleObject(objStream);
}
|
|
|
/// <summary>
/// Builds an element from the stream which has to be positioned at a start tag,
/// an empty-element tag or the start tag placeholder.
/// </summary>
/// <remarks>
/// For a start tag (or its placeholder) the content is read up to the end tag or the
/// end tag placeholder. A missing start tag, a missing end tag and an end tag with
/// a different name are reported as syntax errors.
/// </remarks>
RawElement ReadElement(IEnumerator<RawObject> objStream)
{
	AssertIsEndOfFile();

	RawElement element = new RawElement();
	element.IsProperlyNested = true;

	// Read start tag
	RawTag startTag = ReadSingleObject(objStream) as RawTag;
	DebugAssert(startTag != null, "Start tag expected");
	DebugAssert(startTag.IsStartOrEmptyTag || startTag == StartTagPlaceholder, "Start tag expected");
	bool startTagMissing = startTag == StartTagPlaceholder;
	if (startTagMissing) {
		// The placeholder itself is not stored; the error is located at the object following it
		element.HasStartOrEmptyTag = false;
		element.IsProperlyNested = false;
		OnSyntaxError(element, objStream.Current.StartOffset, objStream.Current.EndOffset,
		              "Matching openning tag was not found");
	} else {
		element.HasStartOrEmptyTag = true;
		element.AddChild(startTag);
	}

	// Read content and end tag
	if (element.StartTag.IsStartTag || startTagMissing) {
		for (;;) {
			RawTag next = objStream.Current as RawTag; // Peek

			if (next == EndTagPlaceholder) {
				// The end tag is missing; consume the placeholder without storing it
				OnSyntaxError(element, element.LastChild.EndOffset, element.LastChild.EndOffset,
				              "Expected '</{0}>'", element.StartTag.Name);
				ReadSingleObject(objStream);
				element.HasEndTag = false;
				element.IsProperlyNested = false;
				break;
			}

			if (next != null && next.IsEndTag) {
				if (next.Name != element.StartTag.Name) {
					OnSyntaxError(element, next.StartOffset + 2, next.StartOffset + 2 + next.Name.Length,
					              "Expected '{0}'. End tag must have same name as start tag.", element.StartTag.Name);
				}
				element.AddChild(ReadSingleObject(objStream));
				element.HasEndTag = true;
				break;
			}

			// Nested content
			RawObject child = ReadTextOrElement(objStream);
			RawElement childElement = child as RawElement;
			if (childElement == null) {
				element.AddChild(child);
				continue;
			}
			// Improper nesting propagates to the parent
			if (!childElement.IsProperlyNested) {
				element.IsProperlyNested = false;
			}
			element.AddChildren(Split(childElement).ToList());
		}
	} else {
		// Empty-element tag
		element.HasEndTag = false;
	}

	element.StartOffset = element.FirstChild.StartOffset;
	element.EndOffset = element.LastChild.EndOffset;

	OnParsed(element); // Need all elements in cache for offset tracking
	return element;
}
|
|
|
/// <summary>
/// Splits an element which has a start tag but no end tag, using indentation as a hint.
/// </summary>
/// <remarks>
/// The element keeps its start tag, all following text and all following items that are
/// indented more than the element itself. The first item that is not indented more ends
/// the element; it and everything after it is returned as siblings following the element.
/// Elements which have an end tag, which consist of an empty-element tag, which have no
/// start tag or which do not start at the beginning of a line are returned unchanged.
/// </remarks>
/// <param name="elem">Element to examine.</param>
/// <returns>Either the original element, or the shortened element followed by the rejected children.</returns>
IEnumerable<RawObject> Split(RawElement elem)
{
	int myIndention = GetIndentLevel(elem);

	// Only an element opened by a real start tag and never closed can be split.
	// An empty-element tag also has no end tag but there is nothing missing in it.
	bool canSplit = elem.HasStartOrEmptyTag && elem.StartTag.IsStartTag
	             && !elem.HasEndTag && myIndention != -1;
	if (!canSplit) {
		yield return elem;
		yield break;
	}

	// The missing end tag is not represented by any child,
	// so every child after the start tag has to be considered
	int lastAccepted = 0; // Accept start tag
	while (lastAccepted + 1 < elem.Children.Count) {
		RawObject nextItem = elem.Children[lastAccepted + 1];
		// Text is always accepted; other items only if they are indented more
		if (nextItem is RawText || GetIndentLevel(nextItem) > myIndention) {
			lastAccepted++;
		} else {
			break; // Reject
		}
	}

	// Accepted everything?
	if (lastAccepted + 1 == elem.Children.Count) {
		yield return elem;
		yield break;
	}

	Log("Splitting {0} - take {1} of {2} nested", elem, lastAccepted, elem.Children.Count - 1);
	RawElement topHalf = new RawElement();
	topHalf.HasStartOrEmptyTag = elem.HasStartOrEmptyTag;
	topHalf.HasEndTag = elem.HasEndTag;
	topHalf.AddChildren(elem.Children.Take(lastAccepted + 1)); // Start tag + nested
	topHalf.StartOffset = topHalf.FirstChild.StartOffset;
	topHalf.EndOffset = topHalf.LastChild.EndOffset;
	OnSyntaxError(topHalf, topHalf.LastChild.EndOffset, topHalf.LastChild.EndOffset,
	              "Expected '</{0}>'", topHalf.StartTag.Name);

	Log("Constructed {0}", topHalf);
	yield return topHalf;

	// The rejected items become siblings of the shortened element
	for(int i = lastAccepted + 1; i < elem.Children.Count; i++) {
		yield return elem.Children[i];
	}
}
|
|
|
/// <summary>
/// Measures the indentation in front of the given object.
/// </summary>
/// <returns>
/// Width of the whitespace between the start of the line (or of the input) and the object,
/// where a space counts as 1 and a tab as 4; -1 if anything else precedes the object on its line.
/// </returns>
int GetIndentLevel(RawObject obj)
{
	int width = 0;
	// Walk backwards from the character just before the object
	for (int pos = obj.StartOffset - 1; pos >= 0; pos--) {
		char c = input[pos];
		if (c == '\r' || c == '\n') {
			// Start of the line reached
			return width;
		}
		if (c == ' ') {
			width++;
		} else if (c == '\t') {
			width += 4;
		} else {
			// The object does not start its line
			return -1;
		}
	}
	// Start of the input reached
	return width;
}
|
|
|
#endregion |
|
|
|
#region Matching heuristics |
|
|
|
// Maximum number of (cheapest) configurations that ProcessObject keeps after each processed object
const int maxConfigurationCount = 10; |
|
|
|
/// <summary>
|
/// One candidate way of nesting the objects processed so far: the stack of still unmatched start tags
|
/// together with its cost and the backtrack information (the document built along the way).
|
/// </summary>
|
class Configuration |
|
{ |
|
/// <summary> Start tags which have not been matched with an end tag yet </summary>
|
public ImmutableStack<RawTag> StartTags { get; set; } |
|
/// <summary> Objects accepted so far, including any start and end tag placeholders that were added </summary>
|
public ImmutableStack<RawObject> Document { get; set; } |
|
/// <summary> Number of modifications to the document needed by this configuration </summary>
|
public int Cost { get; set; } |
|
} |
|
|
|
/// <summary>
/// Set of configurations indexed by their stack of unmatched start tags.
/// For each stack only the cheapest configuration is kept.
/// </summary>
class Configurations: Dictionary<ImmutableStack<RawTag>, Configuration>
{
	/// <summary> Creates an empty set </summary>
	public Configurations()
	{
	}

	/// <summary> Creates a set from the given configurations; see <see cref="Add(Configuration)"/> </summary>
	public Configurations(IEnumerable<Configuration> configs)
	{
		foreach(Configuration item in configs) {
			Add(item);
		}
	}

	/// <summary>
	/// Adds the configuration. If one with the same start tags is already present,
	/// it is overwritten only if the new one is cheaper.
	/// </summary>
	public void Add(Configuration newConfig)
	{
		Configuration existing;
		if (!TryGetValue(newConfig.StartTags, out existing)) {
			base.Add(newConfig.StartTags, newConfig);
			return;
		}
		if (newConfig.Cost < existing.Cost) {
			this[newConfig.StartTags] = newConfig;
		}
	}

	/// <summary> Lists the unmatched start tags and the cost of each configuration, one per line </summary>
	public override string ToString()
	{
		StringBuilder text = new StringBuilder();
		foreach(Configuration config in this.Values) {
			text.Append("\n - '");
			// The stack enumerates from its top, so it is reversed to get document order
			foreach(RawTag startTag in config.StartTags.Reverse()) {
				text.Append('<');
				text.Append(startTag.Name);
				text.Append('>');
			}
			text.AppendFormat("' = {0}", config.Cost);
		}
		return text.ToString();
	}
}
|
|
|
// Tags used to guide the element creation: ProcessObject and MatchTags put them into the
// object stream where a start or an end tag is missing, and the reading code
// (ReadTextOrElement, ReadElement) recognizes them by comparing against these instances.
|
readonly RawTag StartTagPlaceholder = new RawTag(); |
|
readonly RawTag EndTagPlaceholder = new RawTag(); |
|
|
|
/// <summary>
/// Adds start or end tag placeholders so that the document is properly nested.
/// </summary>
/// <remarks>
/// The objects are processed one by one while a set of candidate configurations is maintained.
/// At the end all tags left open are closed with end tag placeholders (cost 1 each)
/// and the document of the cheapest configuration is returned.
/// </remarks>
List<RawObject> MatchTags(IEnumerable<RawObject> objs)
{
	// Start with a single empty configuration
	Configuration initial = new Configuration();
	initial.StartTags = ImmutableStack<RawTag>.Empty;
	initial.Document = ImmutableStack<RawObject>.Empty;
	initial.Cost = 0;

	Configurations configurations = new Configurations();
	configurations.Add(initial);

	foreach(RawObject obj in objs) {
		configurations = ProcessObject(configurations, obj);
	}

	// Close any remaining start tags
	foreach(Configuration config in configurations.Values) {
		while(!config.StartTags.IsEmpty) {
			config.StartTags = config.StartTags.Pop();
			config.Document = config.Document.Push(EndTagPlaceholder);
			config.Cost += 1;
		}
	}
	Log("Configurations after closing all remaining tags:" + configurations.ToString());

	Configuration cheapest = configurations.Values.OrderBy(v => v.Cost).First();
	Log("Best configuration has cost {0}", cheapest.Cost);

	// The document was built by pushing onto a stack, so it has to be reversed
	IEnumerable<RawObject> inDocumentOrder = cheapest.Document.Reverse();
	return inDocumentOrder.ToList();
}
|
|
|
/// <summary> Gets the possible configurations after considering the given object </summary>
/// <remarks>
/// A start tag is pushed onto the stack of unmatched tags of every configuration.
/// An end tag produces several alternatives for every configuration.
/// Anything else is appended to the document unchanged.
/// Only the maxConfigurationCount cheapest resulting configurations are returned.
/// </remarks>
Configurations ProcessObject(Configurations oldConfigs, RawObject obj)
{
	Log("Processing {0}", obj);

	RawTag tag = obj as RawTag;
	Assert(obj is RawTag || obj is RawText || obj is RawElement, obj.GetType().Name + " not expected");
	RawElement element = obj as RawElement;
	if (element != null) {
		Assert(element.IsProperlyNested, "Element not proprly nested");
	}

	Configurations candidates = new Configurations();

	foreach(Configuration previous in oldConfigs.Values) {
		if (tag != null && tag.IsStartTag) {
			// Push start-tag (cost 0)
			candidates.Add(new Configuration {
				StartTags = previous.StartTags.Push(tag),
				Document = previous.Document.Push(tag),
				Cost = previous.Cost,
			});
		} else if (tag != null && tag.IsEndTag) {
			AddEndTagConfigurations(candidates, previous, tag);
		} else {
			// Empty tag or other tag type or text or properly nested element:
			// just append it (cost 0)
			candidates.Add(new Configuration {
				StartTags = previous.StartTags,
				Document = previous.Document.Push(obj),
				Cost = previous.Cost,
			});
		}
	}

	// Keep only the cheapest configurations
	Configurations best = new Configurations(
		candidates.Values.OrderBy(v => v.Cost).Take(maxConfigurationCount)
	);

	Log("Best new configurations:" + best.ToString());

	return best;
}

/// <summary>
/// Adds to <paramref name="newConfigs"/> all alternatives of handling the given end tag
/// in the given configuration.
/// </summary>
void AddEndTagConfigurations(Configurations newConfigs, Configuration oldConfig, RawTag endTag)
{
	ImmutableStack<RawTag> openTags = oldConfig.StartTags;
	ImmutableStack<RawObject> document = oldConfig.Document;
	int cost = oldConfig.Cost;

	// Leave the open tags alone and give the end tag a start tag placeholder (cost 1)
	newConfigs.Add(new Configuration {
		StartTags = openTags,
		Document = document.Push(StartTagPlaceholder).Push(endTag),
		Cost = cost + 1,
	});

	// Close the innermost open tag even though the name does not match (cost 1)
	if (!openTags.IsEmpty && openTags.Peek().Name != endTag.Name) {
		newConfigs.Add(new Configuration {
			StartTags = openTags.Pop(),
			Document = document.Push(endTag),
			Cost = cost + 1,
		});
	}

	// Close 'x' open tags where the last one has a matching name (cost x-1);
	// the tags skipped on the way get end tag placeholders
	int poppedCount = 0;
	ImmutableStack<RawTag> remaining = openTags;
	ImmutableStack<RawObject> closedDocument = document;
	foreach(RawTag popped in openTags) {
		poppedCount++;
		if (popped.Name == endTag.Name) {
			newConfigs.Add(new Configuration {
				StartTags = remaining.Pop(),
				Document = closedDocument.Push(endTag),
				Cost = cost + poppedCount - 1,
			});
		}
		remaining = remaining.Pop();
		closedDocument = closedDocument.Push(EndTagPlaceholder);
	}
}
|
|
|
#endregion |
|
} |
|
}
|
|
|