// Copyright (c) AlphaSierraPapa for the SharpDevelop Team (for details please see \doc\copyright.txt)
// This code is distributed under the GNU LGPL (for details please see \doc\license.txt)

using System;
using System.Collections.Generic;
using System.Globalization;
using System.Linq;
using System.Text;

namespace ICSharpCode.AvalonEdit.Xml
{
	/// <summary>
	/// Reads the input string and turns it into tags, attributes and text fragments.
	/// Before parsing anything it asks the parser's tracked segments for an already
	/// parsed object at the current location and reuses that object when one is found.
	/// </summary>
	class TagReader: TokenReader
	{
		AXmlParser parser;
		TrackedSegmentCollection trackedSegments;
		string input;
		
		/// <summary>
		/// Creates a reader over <paramref name="input"/> that uses the tracked segments
		/// of <paramref name="parser"/> as its cache.
		/// </summary>
		public TagReader(AXmlParser parser, string input): base(input)
		{
			this.parser = parser;
			this.trackedSegments = parser.TrackedSegments;
			this.input = input;
		}
		
		/// <summary>
		/// Same as the two-argument overload, accepting any cached object of type T.
		/// </summary>
		bool TryReadFromCacheOrNew<T>(out T res) where T: AXmlObject, new()
		{
			return TryReadFromCacheOrNew(out res, t => true);
		}
		
		/// <summary>
		/// Looks for a cached object of type T at the current location that satisfies
		/// <paramref name="condition"/>.
		/// </summary>
		/// <returns>
		/// True if a cached object was found; the reader has then skipped over its length.
		/// False if nothing was cached; <paramref name="res"/> is then a fresh instance
		/// that the caller is expected to fill in.
		/// </returns>
		bool TryReadFromCacheOrNew<T>(out T res, Predicate<T> condition) where T: AXmlObject, new()
		{
			T cached = trackedSegments.GetCachedObject<T>(this.CurrentLocation, 0, condition);
			if (cached != null) {
				Skip(cached.Length);
				AXmlParser.Assert(cached.Length > 0, "cached elements must not have zero length");
				res = cached;
				return true;
			} else {
				res = new T();
				return false;
			}
		}
		
		/// <summary>
		/// Logs the freshly parsed object and registers it with the tracked segments.
		/// The maximal touched location is passed along only when the reader looked
		/// beyond the current location while parsing.
		/// </summary>
		void OnParsed(AXmlObject obj)
		{
			AXmlParser.Log("Parsed {0}", obj);
			trackedSegments.AddParsedObject(obj, this.MaxTouchedLocation > this.CurrentLocation ? (int?)this.MaxTouchedLocation : null);
		}
		
		/// <summary>
		/// Read all tags in the document in a flat sequence.
		/// It also includes the text between tags and possibly some properly nested Elements from cache.
		/// </summary>
		public List<AXmlObject> ReadAllTags()
		{
			List<AXmlObject> stream = new List<AXmlObject>();
			while(true) {
				if (IsEndOfFile()) {
					break;
				} else if (TryPeek('<')) {
					// Prefer a whole cached element; otherwise parse a single tag
					AXmlElement elem;
					if (TryReadFromCacheOrNew(out elem, e => e.IsProperlyNested)) {
						stream.Add(elem);
					} else {
						stream.Add(ReadTag());
					}
				} else {
					stream.AddRange(ReadText(TextType.CharacterData));
				}
			}
			return stream;
		}
		
		/// <summary>
		/// Reads one tag: opening bracket, name, content (attributes, DTD content or text,
		/// depending on the kind of tag) and closing bracket. Syntax errors are attached
		/// to the returned tag rather than thrown.
		/// Context: "&lt;"
		/// </summary>
		AXmlTag ReadTag()
		{
			AssertHasMoreData();
			
			AXmlTag tag;
			if (TryReadFromCacheOrNew(out tag)) return tag;
			
			tag.StartOffset = this.CurrentLocation;
			
			// Read the opening bracket
			// It identifies the type of tag and parsing behavior for the rest of it
			tag.OpeningBracket = ReadOpeningBracket();
			
			if (tag.IsUnknownBang && !TryPeekWhiteSpace())
				OnSyntaxError(tag, tag.StartOffset, this.CurrentLocation, "Unknown tag");
			
			if (tag.IsStartOrEmptyTag || tag.IsEndTag || tag.IsProcessingInstruction) {
				// Read the name
				string name;
				if (TryReadName(out name)) {
					if (!IsValidName(name)) {
						OnSyntaxError(tag, this.CurrentLocation - name.Length, this.CurrentLocation, "The name '{0}' is invalid", name);
					}
				} else {
					OnSyntaxError(tag, "Element name expected");
				}
				tag.Name = name;
			} else {
				tag.Name = string.Empty;
			}
			
			// A processing instruction named "xml" at the very start gets attributes parsed like an element
			bool isXmlDeclr = tag.StartOffset == 0 && tag.Name == "xml";
			
			if (tag.IsStartOrEmptyTag || tag.IsEndTag || isXmlDeclr) {
				// Read attributes for the tag
				while(true) {
					// Check for all forbidden 'name' characters first - see ReadName
					if (IsEndOfFile()) break;
					if (TryPeekWhiteSpace()) {
						tag.AddChildren(ReadText(TextType.WhiteSpace));
						continue;  // End of file might be next
					}
					if (TryPeek('<')) break;
					string endBr;
					int endBrStart = this.CurrentLocation; // Just peek
					if (TryReadClosingBracket(out endBr)) {  // End tag
						GoBack(endBrStart);
						break;
					}
					
					// We have "=\'\"" or name - read attribute
					AXmlAttribute attr = ReadAttribulte();
					tag.AddChild(attr);
					if (tag.IsEndTag)
						OnSyntaxError(tag, attr.StartOffset, attr.EndOffset, "Attribute not allowed in end tag.");
				}
			} else if (tag.IsDocumentType) {
				tag.AddChildren(ReadContentOfDTD());
			} else {
				int start = this.CurrentLocation;
				IEnumerable<AXmlObject> text;
				if (tag.IsComment) {
					text = ReadText(TextType.Comment);
				} else if (tag.IsCData) {
					text = ReadText(TextType.CData);
				} else if (tag.IsProcessingInstruction) {
					text = ReadText(TextType.ProcessingInstruction);
				} else if (tag.IsUnknownBang) {
					text = ReadText(TextType.UnknownBang);
				} else {
					throw new InternalException(string.Format(CultureInfo.InvariantCulture, "Unknown opening bracket '{0}'", tag.OpeningBracket));
				}
				// Enumerate now: ReadText is lazy and advances the reader as it is enumerated
				text = text.ToList();
				// Backtrack at complete start
				if (IsEndOfFile() || (tag.IsUnknownBang && TryPeek('<'))) {
					GoBack(start);
				} else {
					tag.AddChildren(text);
				}
			}
			
			// Read closing bracket
			string bracket;
			TryReadClosingBracket(out bracket);
			tag.ClosingBracket = bracket;
			
			// Error check
			int brStart = this.CurrentLocation - (tag.ClosingBracket ?? string.Empty).Length;
			int brEnd = this.CurrentLocation;
			if (tag.Name == null) {
				// One error was reported already
			} else if (tag.IsStartOrEmptyTag) {
				if (tag.ClosingBracket != ">" && tag.ClosingBracket != "/>") OnSyntaxError(tag, brStart, brEnd, "'>' or '/>' expected");
			} else if (tag.IsEndTag) {
				if (tag.ClosingBracket != ">") OnSyntaxError(tag, brStart, brEnd, "'>' expected");
			} else if (tag.IsComment) {
				if (tag.ClosingBracket != "-->") OnSyntaxError(tag, brStart, brEnd, "'-->' expected");
			} else if (tag.IsCData) {
				if (tag.ClosingBracket != "]]>") OnSyntaxError(tag, brStart, brEnd, "']]>' expected");
			} else if (tag.IsProcessingInstruction) {
				if (tag.ClosingBracket != "?>") OnSyntaxError(tag, brStart, brEnd, "'?>' expected");
			} else if (tag.IsUnknownBang) {
				if (tag.ClosingBracket != ">") OnSyntaxError(tag, brStart, brEnd, "'>' expected");
			} else if (tag.IsDocumentType) {
				if (tag.ClosingBracket != ">") OnSyntaxError(tag, brStart, brEnd, "'>' expected");
			} else {
				throw new InternalException(string.Format(CultureInfo.InvariantCulture, "Unknown opening bracket '{0}'", tag.OpeningBracket));
			}
			
			// Attribute name may not appear multiple times
			// (every occurrence after the first one in each group is reported)
			var duplicates = tag.Children.OfType<AXmlAttribute>().GroupBy(attr => attr.Name).SelectMany(g => g.Skip(1));
			foreach(AXmlAttribute attr in duplicates) {
				OnSyntaxError(tag, attr.StartOffset, attr.EndOffset, "Attribute with name '{0}' already exists", attr.Name);
			}
			
			tag.EndOffset = this.CurrentLocation;
			
			OnParsed(tag);
			return tag;
		}
		
		/// <summary>
		/// Reads any of the known opening brackets. (only full bracket)
		/// Context: "&lt;"
		/// </summary>
		string ReadOpeningBracket()
		{
			// We are using a lot of string literals so that the memory instances are shared
			// NOTE(review): in the source as received this method was cut off right after the
			// first return statement. Everything below that point is reconstructed from the
			// bracket kinds ReadTag distinguishes; AXmlTag.DtdNames is assumed to list the
			// DTD openers including their leading "<!" - confirm against AXmlTag.
			if (TryRead('<')) {
				if (TryRead('/')) {
					return "</";
				} else if (TryRead('?')) {
					return "<?";
				} else if (TryRead('!')) {
					if (TryRead("--")) {
						return "<!--";
					} else if (TryRead("[CDATA[")) {
						return "<![CDATA[";
					} else {
						foreach(string dtdName in AXmlTag.DtdNames) {
							// the dtdName includes "<!"
							if (TryRead(dtdName.Remove(0, 2))) return dtdName;
						}
						return "<!";
					}
				} else {
					return "<";
				}
			} else {
				throw new InternalException("'<' expected");
			}
		}
		
		/// <summary>
		/// Reads any of the known closing brackets. (only full bracket)
		/// Context: any
		/// </summary>
		/// <returns>
		/// True and the bracket that was read, or false and an empty string
		/// when no closing bracket is at the current location.
		/// </returns>
		bool TryReadClosingBracket(out string bracket)
		{
			// We are using a lot of string literals so that the memory instances are shared
			if (TryRead('>')) {
				bracket = ">";
			} else if (TryRead("/>")) {
				bracket = "/>";
			} else if (TryRead("?>")) {
				bracket = "?>";
			} else if (TryRead("-->")) {
				bracket = "-->";
			} else if (TryRead("]]>")) {
				bracket = "]]>";
			} else {
				bracket = string.Empty;
				return false;
			}
			return true;
		}
		
		/// <summary>
		/// Reads the content of a document type declaration up to (excluding) its closing
		/// '>' or the next '&lt;' outside of a nested '[' ... ']' section. Tags found inside
		/// the nested section are returned as tags; everything in between is returned as text.
		/// The sequence is lazy: the reader only advances while it is being enumerated.
		/// </summary>
		IEnumerable<AXmlObject> ReadContentOfDTD()
		{
			int start = this.CurrentLocation;
			while(true) {
				if (IsEndOfFile()) break;            // End of file
				TryMoveToNonWhiteSpace();            // Skip whitespace
				if (TryRead('\'')) TryMoveTo('\'');  // Skip single quoted string TODO: Bug
				if (TryRead('\"')) TryMoveTo('\"');  // Skip double quoted string
				if (TryRead('[')) {                  // Start of nested infoset
					// Reading infoset
					while(true) {
						if (IsEndOfFile()) break;
						TryMoveToAnyOf('<', ']');
						if (TryPeek('<')) {
							if (start != this.CurrentLocation) {  // Two following tags
								yield return MakeText(start, this.CurrentLocation);
							}
							yield return ReadTag();
							start = this.CurrentLocation;
						}
						if (TryPeek(']')) break;
					}
				}
				TryRead(']');                        // End of nested infoset
				if (TryPeek('>')) break;             // Proper closing
				if (TryPeek('<')) break;             // Malformed XML
				TryMoveNext();                       // Skip anything else
			}
			if (start != this.CurrentLocation) {
				yield return MakeText(start, this.CurrentLocation);
			}
		}
		
		/// <summary>
		/// Reads one attribute: name, equals sign (with surrounding whitespace) and value.
		/// Missing or malformed parts are reported as syntax errors on the attribute.
		/// Context: name or "=\'\""
		/// </summary>
		AXmlAttribute ReadAttribulte()
		{
			AssertHasMoreData();
			
			AXmlAttribute attr;
			if (TryReadFromCacheOrNew(out attr)) return attr;
			
			attr.StartOffset = this.CurrentLocation;
			
			// Read name
			string name;
			if (TryReadName(out name)) {
				if (!IsValidName(name)) {
					OnSyntaxError(attr, this.CurrentLocation - name.Length, this.CurrentLocation, "The name '{0}' is invalid", name);
				}
			} else {
				OnSyntaxError(attr, "Attribute name expected");
			}
			attr.Name = name;
			
			// Read equals sign and surrounding whitespace
			int checkpoint = this.CurrentLocation;
			TryMoveToNonWhiteSpace();
			if (TryRead('=')) {
				int chk2 = this.CurrentLocation;
				TryMoveToNonWhiteSpace();
				if (!TryPeek('"') && !TryPeek('\'')) {
					// Do not read whitespace if quote does not follow
					GoBack(chk2);
				}
				attr.EqualsSign = GetText(checkpoint, this.CurrentLocation);
			} else {
				GoBack(checkpoint);
				OnSyntaxError(attr, "'=' expected");
				attr.EqualsSign = string.Empty;
			}
			
			// Read attribute value
			int start = this.CurrentLocation;
			char quoteChar = TryPeek('"') ? '"' : '\'';
			bool startsWithQuote;
			if (TryRead(quoteChar)) {
				startsWithQuote = true;
				int valueStart = this.CurrentLocation;
				TryMoveToAnyOf(quoteChar, '<');
				if (TryRead(quoteChar)) {
					if (!TryPeekAnyOf(' ', '\t', '\n', '\r', '/', '>', '?')) {
						if (TryPeekPrevious('=', 2) || (TryPeekPrevious('=', 3) && TryPeekPrevious(' ', 2))) {
							// This actually most likely means that we are in the next attribute value
							GoBack(valueStart);
							ReadAttributeValue(quoteChar);
							if (TryRead(quoteChar)) {
								OnSyntaxError(attr, "White space or end of tag expected");
							} else {
								OnSyntaxError(attr, "Quote {0} expected (or add whitespace after the following one)", quoteChar);
							}
						} else {
							OnSyntaxError(attr, "White space or end of tag expected");
						}
					}
				} else {
					// '<' or end of file
					GoBack(valueStart);
					ReadAttributeValue(quoteChar);
					OnSyntaxError(attr, "Quote {0} expected", quoteChar);
				}
			} else {
				startsWithQuote = false;
				int valueStart = this.CurrentLocation;
				ReadAttributeValue(null);
				// Accept a stray closing quote as part of the (unquoted) value
				TryRead('\"');
				TryRead('\'');
				if (valueStart == this.CurrentLocation) {
					OnSyntaxError(attr, "Attribute value expected");
				} else {
					OnSyntaxError(attr, valueStart, this.CurrentLocation, "Attribute value must be quoted");
				}
			}
			attr.QuotedValue = GetText(start, this.CurrentLocation);
			attr.Value = Unquote(attr.QuotedValue);
			// The unquoted text starts one character later when an opening quote was read
			attr.Value = Dereference(attr, attr.Value, startsWithQuote ? start + 1 : start);
			
			attr.EndOffset = this.CurrentLocation;
			
			OnParsed(attr);
			return attr;
		}
		
		/// <summary>
		/// Read everything up to quote (excluding), opening/closing tag or attribute signature
		/// </summary>
		/// <param name="quote">
		/// The quote character that terminates the value, or null to stop at either quote character.
		/// </param>
		void ReadAttributeValue(char? quote)
		{
			while(true) {
				if (IsEndOfFile()) return;
				// What is next?
				int start = this.CurrentLocation;
				TryMoveToNonWhiteSpace();  // Read white space (if any)
				if (quote.HasValue) {
					if (TryPeek(quote.Value)) return;
				} else {
					if (TryPeek('"') || TryPeek('\'')) return;
				}
				// Opening/closing tag
				string endBr;
				if (TryPeek('<') || TryReadClosingBracket(out endBr)) {
					GoBack(start);
					return;
				}
				// Try reading attribute signature
				string name;
				if (TryReadName(out name)) {
					int nameEnd = this.CurrentLocation;
					if (TryMoveToNonWhiteSpace() && TryRead("=") &&
					    TryMoveToNonWhiteSpace() && TryPeekAnyOf('"', '\''))
					{
						// Start of attribute.  Great
						GoBack(start);
						return;  // Done
					} else {
						// Just some garbage - make it part of the value
						GoBack(nameEnd);
						continue;  // Read more
					}
				}
				TryMoveNext(); // Accept everything else
			}
		}
		
		/// <summary>
		/// Creates, registers and returns a text object of type <c>TextType.Other</c>
		/// covering the input between <paramref name="start"/> and <paramref name="end"/>.
		/// </summary>
		AXmlText MakeText(int start, int end)
		{
			AXmlParser.DebugAssert(end > start, "Empty text");
			
			AXmlText text = new AXmlText() {
				StartOffset = start,
				EndOffset = end,
				EscapedValue = GetText(start, end),
				Type = TextType.Other
			};
			
			OnParsed(text);
			return text;
		}
		
		const int maxEntityLength = 16; // The longest built-in one is 10 ("&#1114111;")
		const int maxTextFragmentSize = 64;
		const int lookAheadLength = (3 * maxTextFragmentSize) / 2; // More so that we do not get small "what was inserted" fragments
		
		/// <summary>
		/// Reads text and optionally separates it into fragments.
		/// It can also return empty set for no appropriate text input.
		/// Make sure you enumerate it only once
		/// </summary>
		/// <param name="type">
		/// Kind of text to read; it determines which characters terminate the text.
		/// </param>
		IEnumerable<AXmlObject> ReadText(TextType type)
		{
			bool lookahead = false;
			while(true) {
				AXmlText text;
				if (TryReadFromCacheOrNew(out text, t => t.Type == type)) {
					// Cached text found
					yield return text;
					continue; // Read next fragment;  the method can handle "no text left"
				}
				text.Type = type;
				
				// Limit the reading to just a few characters
				// (the first character not to be read)
				int fragmentEnd = Math.Min(this.CurrentLocation + maxTextFragmentSize, this.InputLength);
				
				// Look if some further text has been already processed and align so that
				// we hit that cache point.  It is expensive so it is off for the first run
				if (lookahead) {
					// Note: Must fit entity
					AXmlObject nextFragment = trackedSegments.GetCachedObject<AXmlText>(this.CurrentLocation + maxEntityLength, lookAheadLength - maxEntityLength, t => t.Type == type);
					if (nextFragment != null) {
						fragmentEnd = Math.Min(nextFragment.StartOffset, this.InputLength);
						AXmlParser.Log("Parsing only text ({0}-{1}) because later text was already processed", this.CurrentLocation, fragmentEnd);
					}
				}
				lookahead = true;
				
				text.StartOffset = this.CurrentLocation;
				int start = this.CurrentLocation;
				
				// Whitespace would be skipped anyway by any operation
				TryMoveToNonWhiteSpace(fragmentEnd);
				int wsEnd = this.CurrentLocation;
				
				// Try move to the terminator given by the context
				if (type == TextType.WhiteSpace) {
					TryMoveToNonWhiteSpace(fragmentEnd);
				} else if (type == TextType.CharacterData) {
					while(true) {
						if (!TryMoveToAnyOf(new char[] {'<', ']'}, fragmentEnd)) break; // End of fragment
						if (TryPeek('<')) break;
						if (TryPeek(']')) {
							if (TryPeek("]]>")) {
								OnSyntaxError(text, this.CurrentLocation, this.CurrentLocation + 3, "']]>' is not allowed in text");
							}
							TryMoveNext();
							continue;
						}
						throw new Exception("Infinite loop");
					}
				} else if (type == TextType.Comment) {
					// Do not report too many errors
					bool errorReported = false;
					while(true) {
						if (!TryMoveTo('-', fragmentEnd)) break; // End of fragment
						if (TryPeek("-->")) break;
						if (TryPeek("--") && !errorReported) {
							OnSyntaxError(text, this.CurrentLocation, this.CurrentLocation + 2, "'--' is not allowed in comment");
							errorReported = true;
						}
						TryMoveNext();
					}
				} else if (type == TextType.CData) {
					while(true) {
						// We can not use TryMoveTo("]]>", fragmentEnd) because it may incorrectly accept "]" at the end of fragment
						if (!TryMoveTo(']', fragmentEnd)) break; // End of fragment
						if (TryPeek("]]>")) break;
						TryMoveNext();
					}
				} else if (type == TextType.ProcessingInstruction) {
					while(true) {
						if (!TryMoveTo('?', fragmentEnd)) break; // End of fragment
						if (TryPeek("?>")) break;
						TryMoveNext();
					}
				} else if (type == TextType.UnknownBang) {
					TryMoveToAnyOf(new char[] {'<', '>'}, fragmentEnd);
				} else {
					throw new Exception("Uknown type " + type);
				}
				
				text.ContainsOnlyWhitespace = (wsEnd == this.CurrentLocation);
				
				// Terminal found or real end was reached;
				bool finished = this.CurrentLocation < fragmentEnd || IsEndOfFile();
				
				if (!finished) {
					// We have to continue reading more text fragments
					
					// If there is entity reference, make sure the next segment starts with it to prevent fragmentation
					int entitySearchStart = Math.Max(start + 1 /* data for us */, this.CurrentLocation - maxEntityLength);
					int entitySearchLength = this.CurrentLocation - entitySearchStart;
					if (entitySearchLength > 0) {
						// Note that LastIndexOf works backward
						int entityIndex = input.LastIndexOf('&', this.CurrentLocation - 1, entitySearchLength);
						if (entityIndex != -1) {
							GoBack(entityIndex);
						}
					}
				}
				
				text.EscapedValue = GetText(start, this.CurrentLocation);
				if (type == TextType.CharacterData) {
					// Normalize end of line first
					text.Value = Dereference(text, NormalizeEndOfLine(text.EscapedValue), start);
				} else {
					text.Value = text.EscapedValue;
				}
				text.EndOffset = this.CurrentLocation;
				
				// Empty fragments are neither registered nor returned
				if (text.EscapedValue.Length > 0) {
					OnParsed(text);
					yield return text;
				}
				
				if (finished) {
					yield break;
				}
			}
		}
		
		#region Helper methods
		
		/// <summary>
		/// Reports a syntax error covering the single character at the current location.
		/// </summary>
		void OnSyntaxError(AXmlObject obj, string message, params object[] args)
		{
			OnSyntaxError(obj, this.CurrentLocation, this.CurrentLocation + 1, message, args);
		}
		
		/// <summary>
		/// Formats the message (invariant culture), logs it and attaches a syntax error
		/// for the given range to <paramref name="obj"/>. An empty or inverted range is
		/// widened to one character starting at <paramref name="start"/>.
		/// </summary>
		public static void OnSyntaxError(AXmlObject obj, int start, int end, string message, params object[] args)
		{
			if (end <= start) end = start + 1;
			string formattedMessage = string.Format(CultureInfo.InvariantCulture, message, args);
			AXmlParser.Log("Syntax error ({0}-{1}): {2}", start, end, formattedMessage);
			obj.AddSyntaxError(new SyntaxError() {
				Object = obj,
				StartOffset = start,
				EndOffset = end,
				Message = formattedMessage,
			});
		}
		
		/// <summary>
		/// Returns true when <c>XmlConvert.VerifyName</c> accepts the name,
		/// false when it rejects it with an <c>XmlException</c>.
		/// </summary>
		static bool IsValidName(string name)
		{
			try {
				System.Xml.XmlConvert.VerifyName(name);
				return true;
			} catch (System.Xml.XmlException) {
				return false;
			}
		}
		
		/// <summary> Remove quoting from the given string </summary>
		/// <remarks>
		/// Tolerates malformed input: a quote character is removed from the start and/or
		/// the end independently, so a value with only one of its quotes is still trimmed.
		/// </remarks>
		static string Unquote(string quoted)
		{
			if (string.IsNullOrEmpty(quoted)) return string.Empty;
			char first = quoted[0];
			if (quoted.Length == 1) return (first == '"' || first == '\'') ? string.Empty : quoted;
			char last  = quoted[quoted.Length - 1];
			if (first == '"' || first == '\'') {
				if (first == last) {
					// Remove both quotes
					return quoted.Substring(1, quoted.Length - 2);
				} else {
					// Remove first quote
					return quoted.Remove(0, 1);
				}
			} else {
				if (last == '"' || last == '\'') {
					// Remove last quote
					return quoted.Substring(0, quoted.Length - 1);
				} else {
					// Keep whole string
					return quoted;
				}
			}
		}
		
		/// <summary> Replaces "\r\n" and lone "\r" with "\n" </summary>
		static string NormalizeEndOfLine(string text)
		{
			return text.Replace("\r\n", "\n").Replace("\r", "\n");
		}
		
		/// <summary>
		/// Replaces the predefined entities (amp, lt, gt, apos, quot) and numeric character
		/// references in <paramref name="text"/>. References that can not be resolved are
		/// kept verbatim; problems are reported as syntax errors on <paramref name="owner"/>.
		/// </summary>
		/// <param name="owner">Object that receives the syntax errors.</param>
		/// <param name="text">Text that may contain references.</param>
		/// <param name="textLocation">
		/// Offset of <paramref name="text"/> in the input, used to position syntax errors.
		/// </param>
		/// <returns>The dereferenced text; the same instance when it contains no '&amp;'.</returns>
		string Dereference(AXmlObject owner, string text, int textLocation)
		{
			StringBuilder sb = null;  // The dereferenced text so far (all up to 'curr')
			int curr = 0;
			while(true) {
				// Reached end of input
				if (curr == text.Length) {
					if (sb != null) {
						return sb.ToString();
					} else {
						return text;
					}
				}
				
				// Try to find reference
				int start = text.IndexOf('&', curr);
				
				// No more references found
				if (start == -1) {
					if (sb != null) {
						sb.Append(text, curr, text.Length - curr); // Add rest
						return sb.ToString();
					} else {
						return text;
					}
				}
				
				// Append text before the entity reference
				if (sb == null) sb = new StringBuilder(text.Length);
				sb.Append(text, curr, start - curr);
				curr = start;
				
				// Process the entity
				// The error position is derived from the index in 'text', not from the length
				// of the output built so far: once an earlier reference has been replaced, the
				// output is shorter than the source and would yield offsets that are too small.
				// (For character data the caller normalizes line ends first, so the position
				// can still be off by the number of removed '\r' characters.)
				int errorLoc = textLocation + start;
				
				// Find entity name
				int end = text.IndexOfAny(new char[] {'&', ';'}, start + 1, Math.Min(maxEntityLength, text.Length - (start + 1)));
				if (end == -1 || text[end] == '&') {
					// Not found
					OnSyntaxError(owner, errorLoc, errorLoc + 1, "Entity reference must be terminated with ';'");
					// Keep '&'
					sb.Append('&');
					curr++;
					continue;  // Restart and next character location
				}
				string name = text.Substring(start + 1, end - (start + 1));
				
				// Resolve the name
				string replacement;
				if (name.Length == 0) {
					replacement = null;
					OnSyntaxError(owner, errorLoc + 1, errorLoc + 1, "Entity name expected");
				} else if (name == "amp") {
					replacement = "&";
				} else if (name == "lt") {
					replacement = "<";
				} else if (name == "gt") {
					replacement = ">";
				} else if (name == "apos") {
					replacement = "'";
				} else if (name == "quot") {
					replacement = "\"";
				} else if (name.Length > 0 && name[0] == '#') {
					// Success is tracked separately from the value: a hexadecimal reference
					// such as "#xFFFFFFFF" legitimately parses to -1 and must still be
					// rejected as an invalid character below instead of being ignored.
					int num;
					bool parsed;
					if (name.Length > 1 && name[1] == 'x') {
						parsed = int.TryParse(name.Substring(2), NumberStyles.AllowHexSpecifier, CultureInfo.InvariantCulture.NumberFormat, out num);
						if (!parsed) {
							OnSyntaxError(owner, errorLoc + 3, errorLoc + 1 + name.Length, "Hexadecimal code of unicode character expected");
						}
					} else {
						parsed = int.TryParse(name.Substring(1), NumberStyles.None, CultureInfo.InvariantCulture.NumberFormat, out num);
						if (!parsed) {
							OnSyntaxError(owner, errorLoc + 2, errorLoc + 1 + name.Length, "Numeric code of unicode character expected");
						}
					}
					if (parsed) {
						try {
							replacement = char.ConvertFromUtf32(num);
						} catch (ArgumentOutOfRangeException) {
							replacement = null;
							OnSyntaxError(owner, errorLoc + 2, errorLoc + 1 + name.Length, "Invalid unicode character U+{0:X} ({0})", num);
						}
					} else {
						replacement = null;
					}
				} else if (!IsValidName(name)) {
					replacement = null;
					OnSyntaxError(owner, errorLoc + 1, errorLoc + 1, "Invalid entity name");
				} else {
					// Well-formed name that is not one of the predefined entities
					replacement = null;
					if (parser.UnknownEntityReferenceIsError) {
						OnSyntaxError(owner, errorLoc, errorLoc + 1 + name.Length + 1, "Unknown entity reference '{0}'", name);
					}
				}
				
				// Append the replacement to output
				if (replacement != null) {
					sb.Append(replacement);
				} else {
					// Unresolved references are kept exactly as written
					sb.Append('&');
					sb.Append(name);
					sb.Append(';');
				}
				curr = end + 1;
				continue;
			}
		}
		
		#endregion
	}
}